1
|
package eu.dnetlib.data.mdstore.plugins;
|
2
|
|
3
|
import java.io.StringReader;
|
4
|
import java.util.ArrayList;
|
5
|
import java.util.HashMap;
|
6
|
import java.util.List;
|
7
|
import java.util.Map;
|
8
|
import java.util.Optional;
|
9
|
import java.util.regex.Matcher;
|
10
|
import java.util.regex.Pattern;
|
11
|
import java.util.stream.Collectors;
|
12
|
|
13
|
import org.apache.commons.lang3.math.NumberUtils;
|
14
|
import org.apache.commons.lang3.text.WordUtils;
|
15
|
import org.apache.commons.logging.Log;
|
16
|
import org.apache.commons.logging.LogFactory;
|
17
|
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
|
18
|
import org.dom4j.Document;
|
19
|
import org.dom4j.DocumentException;
|
20
|
import org.dom4j.Element;
|
21
|
import org.dom4j.Node;
|
22
|
import org.dom4j.io.SAXReader;
|
23
|
import org.springframework.beans.factory.annotation.Value;
|
24
|
|
25
|
import com.google.common.base.Splitter;
|
26
|
import com.google.common.collect.Lists;
|
27
|
import com.mongodb.BasicDBObject;
|
28
|
import com.mongodb.DBObject;
|
29
|
import com.mongodb.client.MongoCollection;
|
30
|
|
31
|
import eu.dnetlib.clients.pimpa.DataRange;
|
32
|
import eu.dnetlib.clients.pimpa.Laboratory;
|
33
|
import eu.dnetlib.clients.pimpa.Person;
|
34
|
import eu.dnetlib.clients.pimpa.PimpaService;
|
35
|
import eu.dnetlib.clients.pimpa.Year;
|
36
|
import eu.dnetlib.data.mdstore.modular.mongodb.MongoMDStore;
|
37
|
import eu.dnetlib.data.mdstore.plugins.objects.CnrAuthor;
|
38
|
import eu.dnetlib.rmi.data.MDStoreServiceException;
|
39
|
|
40
|
public class EnrichLabsPlugin extends AbstractIstiMDStorePlugin {
|
41
|
|
42
|
/** Logger shared by all the methods of this plugin. */
private static final Log log = LogFactory.getLog(EnrichLabsPlugin.class);
|
43
|
|
44
|
/**
 * Address of the PIMPA web service, injected by Spring from the property
 * {@code plugin.enrich.labs.pimpa.url}; it is the endpoint used by {@code findDataRange}.
 */
@Value("${plugin.enrich.labs.pimpa.url}")
|
45
|
private String serviceUrl;
|
46
|
|
47
|
@SuppressWarnings("unchecked")
|
48
|
@Override
|
49
|
public void process(final MongoMDStore store, final Map<String, String> params) throws MDStoreServiceException {
|
50
|
log.warn("********************************************");
|
51
|
log.warn("* ENRICH LABS");
|
52
|
log.warn("********************************************");
|
53
|
|
54
|
final DataRange res = findDataRange(1980, 2017);
|
55
|
|
56
|
// CODE -> year -> Lab
|
57
|
final Map<String, Map<Integer, List<Laboratory>>> labs = new HashMap<>();
|
58
|
|
59
|
// CODE -> names
|
60
|
final Map<String, String> names = new HashMap<>();
|
61
|
|
62
|
for (final Year y : res.getYears()) {
|
63
|
final Integer year = y.getYear();
|
64
|
for (final Person p : y.getPersons().getList()) {
|
65
|
names.putIfAbsent(p.getCode(), WordUtils.capitalize((p.getName() + " " + p.getSurname()).toLowerCase()));
|
66
|
labs.putIfAbsent(p.getCode(), new HashMap<>());
|
67
|
labs.get(p.getCode()).putIfAbsent(year, p.getLabs().getList());
|
68
|
}
|
69
|
}
|
70
|
|
71
|
final MongoCollection<DBObject> pubsCollection = store.getCollection();
|
72
|
|
73
|
for (final DBObject obj : pubsCollection.find()) {
|
74
|
final String id = obj.get("id").toString();
|
75
|
log.debug("**********************************************");
|
76
|
log.debug("Record " + id);
|
77
|
|
78
|
try {
|
79
|
final Document doc = (new SAXReader()).read(new StringReader(obj.get("body").toString()));
|
80
|
|
81
|
resetAffiliations(doc);
|
82
|
|
83
|
final int publicationYear = NumberUtils.toInt(doc.valueOf("//*[local-name()='date' and @dateType='Accepted']"), 0);
|
84
|
final List<Element> creatorNodes = doc.selectNodes("//*[local-name() = 'creator']");
|
85
|
for (final CnrAuthor cnrAuthor : listCnrAuthors(doc)) {
|
86
|
|
87
|
if (log.isDebugEnabled()) {
|
88
|
log.debug(" Surname : " + cnrAuthor.getSurname());
|
89
|
log.debug(" Name : " + cnrAuthor.getName());
|
90
|
log.debug(" Date : " + publicationYear);
|
91
|
}
|
92
|
final Element node = findNodeCreator(creatorNodes, cnrAuthor.getName(), cnrAuthor.getSurname());
|
93
|
if (node != null) {
|
94
|
|
95
|
log.debug("*** " + node.valueOf("./*[local-name() = 'creatorName']") + " -> " + names.get(cnrAuthor.getCode()));
|
96
|
|
97
|
if (labs.containsKey(cnrAuthor.getCode()) && labs.get(cnrAuthor.getCode()).containsKey(publicationYear)
|
98
|
&& names.containsKey(cnrAuthor.getCode())) {
|
99
|
if (log.isDebugEnabled()) {
|
100
|
log.debug(" Fullname (PIMPA): " + names.get(cnrAuthor.getCode()));
|
101
|
log.debug(" Laboratories (PIMPA): "
|
102
|
+ labs.get(cnrAuthor.getCode()).get(publicationYear).stream().map(Laboratory::getCode).collect(Collectors.joining()));
|
103
|
}
|
104
|
node.selectSingleNode("./*[local-name() = 'creatorName']").setText(names.get(cnrAuthor.getCode()));
|
105
|
for (final Laboratory l : labs.get(cnrAuthor.getCode()).get(publicationYear)) {
|
106
|
final Element affNode = node.addElement("affiliation");
|
107
|
affNode.setText("ISTI-CNR");
|
108
|
affNode.addAttribute("group", l.getDescription());
|
109
|
affNode.addAttribute("type", l.getType());
|
110
|
affNode.addAttribute("groupAcronym", l.getCode());
|
111
|
affNode.addAttribute("code", cnrAuthor.getCode());
|
112
|
}
|
113
|
} else {
|
114
|
log.warn("CNR User not found in pimpa: " + cnrAuthor);
|
115
|
final Element affNode = node.addElement("affiliation");
|
116
|
node.selectSingleNode("./*[local-name() = 'creatorName']").setText(cnrAuthor.getFullname());
|
117
|
affNode.addAttribute("code", cnrAuthor.getCode());
|
118
|
}
|
119
|
|
120
|
} else {
|
121
|
log.warn("----");
|
122
|
log.warn("Match non found for user : " + names.get(cnrAuthor.getCode()));
|
123
|
log.warn(" record : " + id);
|
124
|
log.warn(" list of authors : " + ((List<Element>) doc.selectNodes("//*[local-name() = 'creatorName']"))
|
125
|
.stream()
|
126
|
.map(Element::getText)
|
127
|
.collect(Collectors.joining(", ")));
|
128
|
}
|
129
|
|
130
|
log.debug(" ---");
|
131
|
|
132
|
pubsCollection.updateOne(new BasicDBObject("id", id), new BasicDBObject("$set", new BasicDBObject("body", doc.asXML())));
|
133
|
|
134
|
// For creators-cnrusers matching
|
135
|
// https://svn.driver.research-infrastructures.eu/driver/private/claudio.atzori/dnet-dedup-preprocess/trunk
|
136
|
// https://svn.driver.research-infrastructures.eu/driver/private/claudio.atzori/dnet-dedup-preprocess/trunk/src/main/java/eu/dnetlib/MatchParser.java
|
137
|
}
|
138
|
} catch (final DocumentException e) {
|
139
|
log.warn("Problem parsing a mdstore record");
|
140
|
}
|
141
|
}
|
142
|
}
|
143
|
|
144
|
private void resetAffiliations(final Document doc) {
|
145
|
for (final Object n : doc.selectNodes("//*[local-name() = 'creator']/*[local-name() = 'affiliation']")) {
|
146
|
((Node) n).detach();
|
147
|
}
|
148
|
}
|
149
|
|
150
|
protected List<CnrAuthor> listCnrAuthors(final Document doc) {
|
151
|
final List<CnrAuthor> res = new ArrayList<>();
|
152
|
for (final Object o : doc.selectNodes("//*[local-name()='person']/*[local-name()='infoId']")) {
|
153
|
final String s = ((Element) o).getText();
|
154
|
final Pattern pattern = Pattern.compile("info:cnr-pdr\\/author\\/(.+):(.+)\\/(.+)\\/(.+)");
|
155
|
final Matcher matcher = pattern.matcher(s);
|
156
|
if (matcher.find()) {
|
157
|
if (matcher.group(1).equals("matricola")) {
|
158
|
final CnrAuthor auth = new CnrAuthor();
|
159
|
auth.setCode(matcher.group(2));
|
160
|
auth.setSurname(matcher.group(3));
|
161
|
auth.setName(matcher.group(4));
|
162
|
res.add(auth);
|
163
|
}
|
164
|
}
|
165
|
}
|
166
|
return res;
|
167
|
}
|
168
|
|
169
|
protected Element findNodeCreator(final List<Element> nodes, final String name, final String surname) {
|
170
|
|
171
|
final Iterable<String> s1 = cleanValue(name + " " + surname);
|
172
|
|
173
|
final Optional<Element> res = nodes.stream()
|
174
|
.filter(n -> {
|
175
|
final Iterable<String> s2 = cleanValue(n.valueOf("./*[local-name() = 'creatorName']"));
|
176
|
return verifyMatch(s1, s2) || verifyMatch(s2, s1);
|
177
|
})
|
178
|
.findFirst();
|
179
|
|
180
|
if (res.isPresent()) {
|
181
|
nodes.remove(res.get());
|
182
|
return res.get();
|
183
|
}
|
184
|
|
185
|
return null;
|
186
|
}
|
187
|
|
188
|
protected boolean verifyMatch(final Iterable<String> s1, final Iterable<String> s2) {
|
189
|
|
190
|
final ArrayList<String> cs1 = new ArrayList<>();
|
191
|
final ArrayList<String> cs2 = Lists.newArrayList(s2);
|
192
|
final int start = cs2.size();
|
193
|
for (final String s : s1) {
|
194
|
if (!cs2.remove(s)) {
|
195
|
cs1.add(s);
|
196
|
}
|
197
|
}
|
198
|
for (final String s : cs1) {
|
199
|
cs2.remove(s.substring(0, 1));
|
200
|
}
|
201
|
return (start - cs2.size()) >= 2;
|
202
|
}
|
203
|
|
204
|
protected Iterable<String> cleanValue(final String s) {
|
205
|
return Splitter.on(" ").omitEmptyStrings().trimResults().split(s.toLowerCase()
|
206
|
.replaceAll("[àáâaäææãā]", "a")
|
207
|
.replaceAll("[èéêëēėę]", "e")
|
208
|
.replaceAll("[îïíīįì]", "i")
|
209
|
.replaceAll("[ôöòóœøōõ]", "o")
|
210
|
.replaceAll("[ûüùúū]", "u")
|
211
|
.replaceAll("[^a-z\\s]", ""));
|
212
|
}
|
213
|
|
214
|
private DataRange findDataRange(final int from, final int to) {
|
215
|
final JaxWsProxyFactoryBean factory = new JaxWsProxyFactoryBean();
|
216
|
factory.setServiceClass(PimpaService.class);
|
217
|
factory.setAddress(serviceUrl);
|
218
|
final PimpaService pimpaService = (PimpaService) factory.create();
|
219
|
return pimpaService.getDataRange(from, to);
|
220
|
}
|
221
|
|
222
|
/*
|
223
|
* private PersonLabs findLabForUser(final String user, final int year) { final JaxWsProxyFactoryBean factory = new
|
224
|
* JaxWsProxyFactoryBean(); factory.setServiceClass(PimpaService.class);
|
225
|
* factory.setAddress("http://pimpa.isti.cnr.it/PERSONALE/web-services/iop/iop.webservice.php"); final PimpaService pimpaService =
|
226
|
* (PimpaService) factory.create(); return pimpaService.getLabs(user, year); }
|
227
|
*/
|
228
|
}
|