Project

General

Profile

1
package eu.dnetlib.data.mdstore.plugins;
2

    
3
import java.io.StringReader;
4
import java.util.ArrayList;
5
import java.util.HashMap;
6
import java.util.List;
7
import java.util.Map;
8
import java.util.Optional;
9
import java.util.regex.Matcher;
10
import java.util.regex.Pattern;
11
import java.util.stream.Collectors;
12

    
13
import org.apache.commons.lang3.math.NumberUtils;
14
import org.apache.commons.lang3.text.WordUtils;
15
import org.apache.commons.logging.Log;
16
import org.apache.commons.logging.LogFactory;
17
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
18
import org.dom4j.Document;
19
import org.dom4j.DocumentException;
20
import org.dom4j.Element;
21
import org.dom4j.Node;
22
import org.dom4j.io.SAXReader;
23
import org.springframework.beans.factory.annotation.Value;
24

    
25
import com.google.common.base.Splitter;
26
import com.google.common.collect.Lists;
27
import com.mongodb.BasicDBObject;
28
import com.mongodb.DBObject;
29
import com.mongodb.client.MongoCollection;
30

    
31
import eu.dnetlib.clients.pimpa.DataRange;
32
import eu.dnetlib.clients.pimpa.Laboratory;
33
import eu.dnetlib.clients.pimpa.Person;
34
import eu.dnetlib.clients.pimpa.PimpaService;
35
import eu.dnetlib.clients.pimpa.Year;
36
import eu.dnetlib.data.mdstore.modular.mongodb.MongoMDStore;
37
import eu.dnetlib.data.mdstore.plugins.objects.CnrAuthor;
38
import eu.dnetlib.rmi.data.MDStoreServiceException;
39

    
40
public class EnrichLabsPlugin extends AbstractIstiMDStorePlugin {
41

    
42
	private static final Log log = LogFactory.getLog(EnrichLabsPlugin.class);
43

    
44
	@Value("${plugin.enrich.labs.pimpa.url}")
45
	private String serviceUrl;
46

    
47
	@SuppressWarnings("unchecked")
48
	@Override
49
	public void process(final MongoMDStore store, final Map<String, String> params) throws MDStoreServiceException {
50
		log.warn("********************************************");
51
		log.warn("* ENRICH LABS");
52
		log.warn("********************************************");
53

    
54
		final DataRange res = findDataRange(1980, 2017);
55

    
56
		// CODE -> year -> Lab
57
		final Map<String, Map<Integer, List<Laboratory>>> labs = new HashMap<>();
58

    
59
		// CODE -> names
60
		final Map<String, String> names = new HashMap<>();
61

    
62
		for (final Year y : res.getYears()) {
63
			final Integer year = y.getYear();
64
			for (final Person p : y.getPersons().getList()) {
65
				names.putIfAbsent(p.getCode(), WordUtils.capitalize((p.getName() + " " + p.getSurname()).toLowerCase()));
66
				labs.putIfAbsent(p.getCode(), new HashMap<>());
67
				labs.get(p.getCode()).putIfAbsent(year, p.getLabs().getList());
68
			}
69
		}
70

    
71
		final MongoCollection<DBObject> pubsCollection = store.getCollection();
72

    
73
		for (final DBObject obj : pubsCollection.find()) {
74
			final String id = obj.get("id").toString();
75
			log.debug("**********************************************");
76
			log.debug("Record " + id);
77

    
78
			try {
79
				final Document doc = (new SAXReader()).read(new StringReader(obj.get("body").toString()));
80

    
81
				resetAffiliations(doc);
82

    
83
				final int publicationYear = NumberUtils.toInt(doc.valueOf("//*[local-name()='date' and @dateType='Accepted']"), 0);
84
				final List<Element> creatorNodes = doc.selectNodes("//*[local-name() = 'creator']");
85
				for (final CnrAuthor cnrAuthor : listCnrAuthors(doc)) {
86

    
87
					if (log.isDebugEnabled()) {
88
						log.debug("    Surname : " + cnrAuthor.getSurname());
89
						log.debug("    Name    : " + cnrAuthor.getName());
90
						log.debug("    Date    : " + publicationYear);
91
					}
92
					final Element node = findNodeCreator(creatorNodes, cnrAuthor.getName(), cnrAuthor.getSurname());
93
					if (node != null) {
94

    
95
						log.debug("*** " + node.valueOf("./*[local-name() = 'creatorName']") + " -> " + names.get(cnrAuthor.getCode()));
96

    
97
						if (labs.containsKey(cnrAuthor.getCode()) && labs.get(cnrAuthor.getCode()).containsKey(publicationYear)
98
								&& names.containsKey(cnrAuthor.getCode())) {
99
							if (log.isDebugEnabled()) {
100
								log.debug("    Fullname (PIMPA): " + names.get(cnrAuthor.getCode()));
101
								log.debug("    Laboratories (PIMPA): "
102
										+ labs.get(cnrAuthor.getCode()).get(publicationYear).stream().map(Laboratory::getCode).collect(Collectors.joining()));
103
							}
104
							node.selectSingleNode("./*[local-name() = 'creatorName']").setText(names.get(cnrAuthor.getCode()));
105
							for (final Laboratory l : labs.get(cnrAuthor.getCode()).get(publicationYear)) {
106
								final Element affNode = node.addElement("affiliation");
107
								affNode.setText("ISTI-CNR");
108
								affNode.addAttribute("group", l.getDescription());
109
								affNode.addAttribute("type", l.getType());
110
								affNode.addAttribute("groupAcronym", l.getCode());
111
								affNode.addAttribute("code", cnrAuthor.getCode());
112
							}
113
						} else {
114
							log.warn("CNR User not found in pimpa: " + cnrAuthor);
115
							final Element affNode = node.addElement("affiliation");
116
							node.selectSingleNode("./*[local-name() = 'creatorName']").setText(cnrAuthor.getFullname());
117
							affNode.addAttribute("code", cnrAuthor.getCode());
118
						}
119

    
120
					} else {
121
						log.warn("----");
122
						log.warn("Match non found for user : " + names.get(cnrAuthor.getCode()));
123
						log.warn("                  record : " + id);
124
						log.warn("         list of authors : " + ((List<Element>) doc.selectNodes("//*[local-name() = 'creatorName']"))
125
								.stream()
126
								.map(Element::getText)
127
								.collect(Collectors.joining(", ")));
128
					}
129

    
130
					log.debug("    ---");
131

    
132
					pubsCollection.updateOne(new BasicDBObject("id", id), new BasicDBObject("$set", new BasicDBObject("body", doc.asXML())));
133

    
134
					// For creators-cnrusers matching
135
					// https://svn.driver.research-infrastructures.eu/driver/private/claudio.atzori/dnet-dedup-preprocess/trunk
136
					// https://svn.driver.research-infrastructures.eu/driver/private/claudio.atzori/dnet-dedup-preprocess/trunk/src/main/java/eu/dnetlib/MatchParser.java
137
				}
138
			} catch (final DocumentException e) {
139
				log.warn("Problem parsing a mdstore record");
140
			}
141
		}
142
	}
143

    
144
	private void resetAffiliations(final Document doc) {
145
		for (final Object n : doc.selectNodes("//*[local-name() = 'creator']/*[local-name() = 'affiliation']")) {
146
			((Node) n).detach();
147
		}
148
	}
149

    
150
	protected List<CnrAuthor> listCnrAuthors(final Document doc) {
151
		final List<CnrAuthor> res = new ArrayList<>();
152
		for (final Object o : doc.selectNodes("//*[local-name()='person']/*[local-name()='infoId']")) {
153
			final String s = ((Element) o).getText();
154
			final Pattern pattern = Pattern.compile("info:cnr-pdr\\/author\\/(.+):(.+)\\/(.+)\\/(.+)");
155
			final Matcher matcher = pattern.matcher(s);
156
			if (matcher.find()) {
157
				if (matcher.group(1).equals("matricola")) {
158
					final CnrAuthor auth = new CnrAuthor();
159
					auth.setCode(matcher.group(2));
160
					auth.setSurname(matcher.group(3));
161
					auth.setName(matcher.group(4));
162
					res.add(auth);
163
				}
164
			}
165
		}
166
		return res;
167
	}
168

    
169
	protected Element findNodeCreator(final List<Element> nodes, final String name, final String surname) {
170

    
171
		final Iterable<String> s1 = cleanValue(name + " " + surname);
172

    
173
		final Optional<Element> res = nodes.stream()
174
				.filter(n -> {
175
					final Iterable<String> s2 = cleanValue(n.valueOf("./*[local-name() = 'creatorName']"));
176
					return verifyMatch(s1, s2) || verifyMatch(s2, s1);
177
				})
178
				.findFirst();
179

    
180
		if (res.isPresent()) {
181
			nodes.remove(res.get());
182
			return res.get();
183
		}
184

    
185
		return null;
186
	}
187

    
188
	protected boolean verifyMatch(final Iterable<String> s1, final Iterable<String> s2) {
189

    
190
		final ArrayList<String> cs1 = new ArrayList<>();
191
		final ArrayList<String> cs2 = Lists.newArrayList(s2);
192
		final int start = cs2.size();
193
		for (final String s : s1) {
194
			if (!cs2.remove(s)) {
195
				cs1.add(s);
196
			}
197
		}
198
		for (final String s : cs1) {
199
			cs2.remove(s.substring(0, 1));
200
		}
201
		return (start - cs2.size()) >= 2;
202
	}
203

    
204
	protected Iterable<String> cleanValue(final String s) {
205
		return Splitter.on(" ").omitEmptyStrings().trimResults().split(s.toLowerCase()
206
				.replaceAll("[àáâaäææãā]", "a")
207
				.replaceAll("[èéêëēėę]", "e")
208
				.replaceAll("[îïíīįì]", "i")
209
				.replaceAll("[ôöòóœøōõ]", "o")
210
				.replaceAll("[ûüùúū]", "u")
211
				.replaceAll("[^a-z\\s]", ""));
212
	}
213

    
214
	private DataRange findDataRange(final int from, final int to) {
215
		final JaxWsProxyFactoryBean factory = new JaxWsProxyFactoryBean();
216
		factory.setServiceClass(PimpaService.class);
217
		factory.setAddress(serviceUrl);
218
		final PimpaService pimpaService = (PimpaService) factory.create();
219
		return pimpaService.getDataRange(from, to);
220
	}
221

    
222
	/*
223
	 * private PersonLabs findLabForUser(final String user, final int year) { final JaxWsProxyFactoryBean factory = new
224
	 * JaxWsProxyFactoryBean(); factory.setServiceClass(PimpaService.class);
225
	 * factory.setAddress("http://pimpa.isti.cnr.it/PERSONALE/web-services/iop/iop.webservice.php"); final PimpaService pimpaService =
226
	 * (PimpaService) factory.create(); return pimpaService.getLabs(user, year); }
227
	 */
228
}
(3-3/7)