1 |
44133
|
michele.ar
|
package eu.dnetlib.data.mdstore.plugins;
|
2 |
|
|
|
3 |
49368
|
michele.ar
|
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;

import com.google.common.base.Splitter;
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;

import eu.dnetlib.data.mdstore.plugins.objects.MdRecord;
import eu.dnetlib.data.mdstore.plugins.objects.MyURL;
import eu.dnetlib.data.mdstore.plugins.objects.Project;
import eu.dnetlib.data.utils.XsltFunctions;
|
33 |
49416
|
michele.ar
|
|
34 |
49291
|
michele.ar
|
public class EnrichOpenairePlugin extends GenericDoiMdstorePlugin {
|
35 |
49210
|
michele.ar
|
|
36 |
49368
|
michele.ar
|
private static final Log log = LogFactory.getLog(EnrichOpenairePlugin.class);
|
37 |
|
|
|
38 |
|
|
@Value("${plugin.enrich.publications.openaire.url}")
|
39 |
|
|
private String baseUrl;
|
40 |
|
|
|
41 |
60102
|
michele.ar
|
@Value("${plugin.enrich.openaire.datasources.blacklist}")
|
42 |
|
|
private String datasourceBlackList;
|
43 |
|
|
|
44 |
60864
|
michele.ar
|
@Autowired
|
45 |
|
|
private MongoClient mongoClient;
|
46 |
|
|
|
47 |
55101
|
michele.ar
|
private Map<String, Counter> counters = new HashMap<>();
|
48 |
|
|
|
49 |
49291
|
michele.ar
|
@Override
|
50 |
|
|
protected URI prepareURI(final String doi) throws URISyntaxException {
|
51 |
49368
|
michele.ar
|
return new URI(String.format(baseUrl, doi));
|
52 |
49291
|
michele.ar
|
}
|
53 |
44133
|
michele.ar
|
|
54 |
|
|
@Override
|
55 |
60864
|
michele.ar
|
protected MongoCollection<org.bson.Document> getCacheCollection() {
|
56 |
|
|
return mongoClient.getDatabase("API_CACHES").getCollection("OPENAIRE_API_CACHE");
|
57 |
|
|
}
|
58 |
|
|
|
59 |
|
|
@Override
|
60 |
55101
|
michele.ar
|
protected void reconfigure(final Map<String, String> params) {
|
61 |
|
|
counters.clear();
|
62 |
|
|
counters.put("subjects", new Counter());
|
63 |
|
|
counters.put("citations", new Counter());
|
64 |
|
|
counters.put("urls", new Counter());
|
65 |
|
|
counters.put("projects", new Counter());
|
66 |
59928
|
michele.ar
|
counters.put("dois", new Counter());
|
67 |
55101
|
michele.ar
|
}
|
68 |
44133
|
michele.ar
|
|
69 |
51020
|
michele.ar
|
@Override
|
70 |
55101
|
michele.ar
|
protected void resetConfiguration() {
|
71 |
|
|
log.info("***** Openaire Enrichment - subjects : " + counters.get("subjects"));
|
72 |
|
|
log.info("***** Openaire Enrichment - citations : " + counters.get("citations"));
|
73 |
|
|
log.info("***** Openaire Enrichment - urls : " + counters.get("urls"));
|
74 |
|
|
log.info("***** Openaire Enrichment - projects : " + counters.get("projects"));
|
75 |
59928
|
michele.ar
|
log.info("***** Openaire Enrichment - dois : " + counters.get("dois"));
|
76 |
55101
|
michele.ar
|
counters.clear();
|
77 |
|
|
}
|
78 |
54968
|
michele.ar
|
|
79 |
|
|
@Override
|
80 |
51020
|
michele.ar
|
protected boolean updateDocument(final MdRecord doc, final String response) {
|
81 |
55101
|
michele.ar
|
counters.get("subjects").incrementBefore(doc.getSubjects().size());
|
82 |
|
|
counters.get("citations").incrementBefore(doc.getCitations().size());
|
83 |
|
|
counters.get("urls").incrementBefore(doc.getUrls().size());
|
84 |
|
|
counters.get("projects").incrementBefore(doc.getProjects().size());
|
85 |
59928
|
michele.ar
|
counters.get("dois").incrementBefore(doc.getDois().size());
|
86 |
51020
|
michele.ar
|
|
87 |
49368
|
michele.ar
|
try {
|
88 |
|
|
final Document docRes = (new SAXReader()).read(new StringReader(response));
|
89 |
49210
|
michele.ar
|
|
90 |
49368
|
michele.ar
|
final List<?> results = docRes.selectNodes("/response/results/result");
|
91 |
44133
|
michele.ar
|
|
92 |
49368
|
michele.ar
|
if (results.size() == 1) {
|
93 |
|
|
final Node n = (Node) results.get(0);
|
94 |
49380
|
michele.ar
|
updateSubjects(doc, n);
|
95 |
|
|
updateCitations(doc, n);
|
96 |
|
|
updateUrls(doc, n);
|
97 |
49395
|
michele.ar
|
updateProjects(doc, n);
|
98 |
59928
|
michele.ar
|
updateDois(doc, n);
|
99 |
51020
|
michele.ar
|
updateBestRights(doc);
|
100 |
55101
|
michele.ar
|
|
101 |
49368
|
michele.ar
|
return true;
|
102 |
|
|
} else if (results.size() == 1) {
|
103 |
|
|
log.warn("Too many responses");
|
104 |
|
|
}
|
105 |
|
|
} catch (final DocumentException e) {
|
106 |
|
|
log.warn("Invalid response", e);
|
107 |
55101
|
michele.ar
|
} finally {
|
108 |
|
|
counters.get("subjects").incrementAfter(doc.getSubjects().size());
|
109 |
|
|
counters.get("citations").incrementAfter(doc.getCitations().size());
|
110 |
|
|
counters.get("urls").incrementAfter(doc.getUrls().size());
|
111 |
|
|
counters.get("projects").incrementAfter(doc.getProjects().size());
|
112 |
59928
|
michele.ar
|
counters.get("dois").incrementAfter(doc.getDois().size());
|
113 |
49368
|
michele.ar
|
}
|
114 |
|
|
|
115 |
|
|
return false;
|
116 |
49210
|
michele.ar
|
}
|
117 |
49380
|
michele.ar
|
|
118 |
51020
|
michele.ar
|
private void updateSubjects(final MdRecord doc, final Node n) {
|
119 |
55934
|
michele.ar
|
final Set<String> subjects = doc.getSubjects()
|
120 |
|
|
.stream()
|
121 |
|
|
.map(EnrichOpenairePlugin::cleanSubject)
|
122 |
|
|
.flatMap(List::stream)
|
123 |
|
|
.collect(Collectors.toSet());
|
124 |
|
|
|
125 |
49380
|
michele.ar
|
for (final Object o : n.selectNodes(".//subject[@classid='keyword']")) {
|
126 |
55934
|
michele.ar
|
subjects.addAll(cleanSubject(((Node) o).getText().trim()));
|
127 |
49380
|
michele.ar
|
}
|
128 |
55934
|
michele.ar
|
|
129 |
51020
|
michele.ar
|
doc.setSubjects(subjects);
|
130 |
49380
|
michele.ar
|
}
|
131 |
|
|
|
132 |
55934
|
michele.ar
|
public static List<String> cleanSubject(final String s) {
|
133 |
|
|
if (s.isEmpty()) {
|
134 |
|
|
return new ArrayList<>();
|
135 |
|
|
} else if (s.startsWith("info:eu-repo/classification/msc/")) {
|
136 |
|
|
return new ArrayList<>();
|
137 |
|
|
} else if (s.startsWith("info:eu-repo/classification/acm/")) {
|
138 |
|
|
return Arrays.asList(s.replaceFirst("info:eu-repo/classification/acm/", ""));
|
139 |
|
|
} else if (s.contains(";")) {
|
140 |
|
|
return Splitter.on(";").trimResults().omitEmptyStrings().splitToList(s);
|
141 |
|
|
} else if (s.contains(",")) {
|
142 |
|
|
return Splitter.on(",").trimResults().omitEmptyStrings().splitToList(s);
|
143 |
|
|
} else {
|
144 |
|
|
return Arrays.asList(s);
|
145 |
|
|
}
|
146 |
49804
|
michele.ar
|
}
|
147 |
|
|
|
148 |
51020
|
michele.ar
|
private void updateCitations(final MdRecord doc, final Node n) {
|
149 |
|
|
doc.getCitations().clear();
|
150 |
49380
|
michele.ar
|
|
151 |
|
|
for (final Object o : n.selectNodes(".//citations/citation/rawText")) {
|
152 |
51020
|
michele.ar
|
doc.getCitations().add(((Node) o).getText());
|
153 |
49380
|
michele.ar
|
}
|
154 |
62539
|
michele.ar
|
for (final Object o : n.selectNodes(".//references/reference/rawText")) {
|
155 |
|
|
doc.getCitations().add(((Node) o).getText());
|
156 |
|
|
}
|
157 |
49380
|
michele.ar
|
}
|
158 |
|
|
|
159 |
51020
|
michele.ar
|
private void updateUrls(final MdRecord doc, final Node n) {
|
160 |
|
|
doc.getUrls().addAll(doc.getUrls());
|
161 |
49380
|
michele.ar
|
|
162 |
60102
|
michele.ar
|
final Set<String> blacklist =
|
163 |
|
|
Arrays.stream(datasourceBlackList.split(",")).map(String::trim).filter(StringUtils::isNotBlank).collect(Collectors.toSet());
|
164 |
|
|
|
165 |
49380
|
michele.ar
|
for (final Object oin : n.selectNodes(".//instance")) {
|
166 |
60102
|
michele.ar
|
|
167 |
|
|
final String hostedByid = ((Element) oin).valueOf("./hostedby/@id").trim();
|
168 |
|
|
|
169 |
|
|
if (!blacklist.contains(hostedByid)) {
|
170 |
61593
|
michele.ar
|
final String hostedBy = ((Element) oin).valueOf("./hostedby/@name").trim();
|
171 |
60102
|
michele.ar
|
final String rights = ((Element) oin).valueOf("./accessright/@classname").trim();
|
172 |
|
|
|
173 |
|
|
for (final Object ourl : ((Element) oin).selectNodes("./webresource/url")) {
|
174 |
61593
|
michele.ar
|
final String url = ((Node) ourl).getText().trim();
|
175 |
|
|
final String name =
|
176 |
|
|
hostedBy.equalsIgnoreCase("Unknown") || hostedBy.equalsIgnoreCase("Unknown Repository") ? XsltFunctions.serverName(url) : hostedBy;
|
177 |
|
|
final MyURL u = new MyURL(url, name, rights);
|
178 |
60102
|
michele.ar
|
doc.getUrls().remove(u);
|
179 |
|
|
doc.getUrls().add(u);
|
180 |
|
|
}
|
181 |
49380
|
michele.ar
|
}
|
182 |
|
|
}
|
183 |
|
|
}
|
184 |
49395
|
michele.ar
|
|
185 |
51020
|
michele.ar
|
private void updateProjects(final MdRecord doc, final Node n) {
|
186 |
49416
|
michele.ar
|
|
187 |
49395
|
michele.ar
|
for (final Object op : n.selectNodes(".//rels/rel[./to/@type='project']")) {
|
188 |
|
|
final Node p = (Node) op;
|
189 |
59332
|
michele.ar
|
final String name = p.valueOf("./title").trim();
|
190 |
49395
|
michele.ar
|
|
191 |
59332
|
michele.ar
|
if (StringUtils.isNotBlank(name) && !name.equalsIgnoreCase("null") && !name.equalsIgnoreCase("unidentified") && !name.equalsIgnoreCase("unknown")) {
|
192 |
|
|
final Project np = new Project();
|
193 |
|
|
np.setOpenaireId(p.valueOf("./to"));
|
194 |
|
|
np.setCode(p.valueOf("./code"));
|
195 |
|
|
np.setName(name);
|
196 |
|
|
np.setAcronym(p.valueOf("./acronym"));
|
197 |
|
|
np.setFunder(p.valueOf(".//funder/@shortname"));
|
198 |
|
|
np.setProgram(p.valueOf(".//funding_level_0/@name"));
|
199 |
|
|
np.setJurisdiction(p.valueOf(".//funder/@jurisdiction"));
|
200 |
61487
|
michele.ar
|
np.setInfoId(XsltFunctions.projectLongId(np.getFunder(), np.getProgram(), np.getCode(), np.getJurisdiction(),
|
201 |
59332
|
michele.ar
|
np.getName(), np.getAcronym()));
|
202 |
|
|
|
203 |
|
|
doc.getProjects().add(np);
|
204 |
|
|
}
|
205 |
49395
|
michele.ar
|
}
|
206 |
50059
|
michele.ar
|
}
|
207 |
49395
|
michele.ar
|
|
208 |
59928
|
michele.ar
|
private void updateDois(final MdRecord doc, final Node n) {
|
209 |
59937
|
michele.ar
|
for (final Object od : n.selectNodes(".//*[local-name()='result']/pid[@classid='doi']")) {
|
210 |
61531
|
michele.ar
|
final String doi = XsltFunctions.cleanDoi(((Node) od).getText());
|
211 |
59928
|
michele.ar
|
if (StringUtils.isNotBlank(doi)) {
|
212 |
|
|
doc.getDois().add(doi);
|
213 |
|
|
}
|
214 |
|
|
}
|
215 |
|
|
}
|
216 |
|
|
|
217 |
51020
|
michele.ar
|
private void updateBestRights(final MdRecord doc) {
|
218 |
|
|
final Set<String> availables = doc.getUrls().stream().map(MyURL::getRights).map(String::toUpperCase).collect(Collectors.toSet());
|
219 |
50059
|
michele.ar
|
if (availables.contains("OPEN ACCESS")) {
|
220 |
51020
|
michele.ar
|
doc.setBestRights("Open Access");
|
221 |
50059
|
michele.ar
|
} else if (availables.contains("EMBARGO")) {
|
222 |
51020
|
michele.ar
|
doc.setBestRights("Embargo");
|
223 |
50059
|
michele.ar
|
} else if (availables.contains("RESTRICTED")) {
|
224 |
51020
|
michele.ar
|
doc.setBestRights("Restricted");
|
225 |
50059
|
michele.ar
|
} else if (availables.contains("CLOSED ACCESS")) {
|
226 |
51020
|
michele.ar
|
doc.setBestRights("Closed Access");
|
227 |
50059
|
michele.ar
|
} else {
|
228 |
51020
|
michele.ar
|
doc.setBestRights("Unknown");
|
229 |
50059
|
michele.ar
|
}
|
230 |
49395
|
michele.ar
|
}
|
231 |
|
|
|
232 |
44133
|
michele.ar
|
}
|