Index: modules/cnr-data-utility-cleaner-service/trunk/pom.xml
===================================================================
--- modules/cnr-data-utility-cleaner-service/trunk/pom.xml (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/trunk/pom.xml (revision 45159)
@@ -0,0 +1,71 @@
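+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+	<parent>
+		<groupId>eu.dnetlib</groupId>
+		<artifactId>dnet-parent</artifactId>
+		<version>1.0.0</version>
+	</parent>
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>eu.dnetlib</groupId>
+	<artifactId>cnr-data-utility-cleaner-service</artifactId>
+	<packaging>jar</packaging>
+	<version>3.0.1-SNAPSHOT</version>
+	<scm>
+		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/cnr-data-utility-cleaner-service/trunk</developerConnection> <!-- NOTE(review): element name reconstructed; could also be <connection> -->
+	</scm>
+	<dependencies>
+		<dependency>
+			<groupId>org.mockito</groupId>
+			<artifactId>mockito-core</artifactId>
+			<version>1.6</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-test-utils</artifactId>
+			<version>[1.0.0,2.0.0)</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-rmi-api</artifactId>
+			<version>[2.0.0,3.0.0)</version>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-service-common</artifactId>
+			<version>[2.1.2,3.0.0)</version>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-resultset-service</artifactId>
+			<version>[2.0.0,3.0.0)</version>
+		</dependency>
+		<dependency>
+			<groupId>com.google.guava</groupId>
+			<artifactId>guava</artifactId>
+			<version>${google.guava.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.codehaus.groovy</groupId>
+			<artifactId>groovy-all</artifactId>
+			<version>2.1.6</version>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-inspector</artifactId>
+			<version>[1.0.0,2.0.0)</version>
+		</dependency>
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>cnr-data-utility-cleaner-rmi</artifactId>
+			<version>[2.0.0,3.0.0)</version>
+		</dependency>
+		<dependency>
+			<groupId>junit</groupId>
+			<artifactId>junit</artifactId>
+			<version>${junit.version}</version>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+</project>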
Index: modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/deploy.info
===================================================================
--- modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/deploy.info (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/deploy.info (revision 45159)
@@ -0,0 +1 @@
+{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/cnr-data-utility-cleaner-service/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "cnr-data-utility-cleaner-service"}
\ No newline at end of file
Index: modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java
===================================================================
--- modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/VocabularyRuleTest.java (revision 45159)
@@ -0,0 +1,146 @@
+package eu.dnetlib.data.utility.cleaner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.dom4j.Document;
+import org.dom4j.io.SAXReader;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.Mock;
+import org.mockito.runners.MockitoJUnit44Runner;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+/**
+ * Unit tests for {@link VocabularyRule} backed by a mocked lookup service.
+ *
+ * NOTE(review): the XML documents passed to execute() had lost their markup (only the text
+ * content was left, which is not parseable); they are rebuilt here from each test's xpath and
+ * assertions. Confirm them against the repository copy.
+ */
+@RunWith(MockitoJUnit44Runner.class)
+public class VocabularyRuleTest {
+
+	private static final String VOCABULARY_NAME_1 = "TEST VOCABULARY 1";
+	private static final String VOCABULARY_NAME_2 = "TEST VOCABULARY 2";
+
+	/** Rows served by the mocked lookup: three synonyms, all pointing to the term AAAA. */
+	private static final List<String> VOCABULARY = Lists.newArrayList("XXXX|-:-|AAAA", "YYYY|-:-|AAAA", "ZZZZ|-:-|AAAA");
+
+	/**
+	 * Class Under Test
+	 */
+	private VocabularyRule rule;
+
+	@Mock
+	private ISLookUpService lookup;
+
+	@Before
+	public void setUp() throws Exception {
+		when(lookup.quickSearchProfile(anyString())).thenReturn(VOCABULARY);
+
+		rule = new VocabularyRule(Sets.newHashSet(VOCABULARY_NAME_1, VOCABULARY_NAME_2), lookup);
+	}
+
+	/** Expects one lookup query per vocabulary and as many indexed terms as mocked rows. */
+	@Test
+	public void testSetup() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		execute("<a><b>XXXX</b></a>");
+
+		verify(lookup, times(2)).quickSearchProfile(anyString());
+		assertEquals(VOCABULARY.size(), rule.getVocabularyTerms().size());
+	}
+
+	@Test
+	public void testApplyXpathRule() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b>XXXX</b></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+		assertNull(rule.verifyValue("AAAA"));
+		assertNotNull(rule.verifyValue("XXXX"));
+	}
+
+	// NOTE(review): the original fixtures of the _2 and _3 variants cannot be told apart any more;
+	// both use the simplest document that satisfies their assertions.
+	@Test
+	public void testApplyXpathRule_2() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b>XXXX</b></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+		assertNull(rule.verifyValue("AAAA"));
+		assertNotNull(rule.verifyValue("XXXX"));
+	}
+
+	@Test
+	public void testApplyXpathRule_3() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b>XXXX</b></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+	}
+
+	/** Expects the rule to rewrite an attribute value selected by the xpath. */
+	@Test
+	public void testApplyXpathRule_attr() throws Exception {
+		final String xpath = "/a/b/@value";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b value='XXXX'/></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+		assertNull(rule.verifyValue("AAAA"));
+		assertNotNull(rule.verifyValue("XXXX"));
+	}
+
+	/** Expects whitespace around the document value not to prevent the match. */
+	@Test
+	public void testApplyXpathRule_with_spaces() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b> XXXX </b></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+		assertNull(rule.verifyValue("AAAA"));
+		assertNotNull(rule.verifyValue(" XXXX "));
+	}
+
+	/** Expects synonym matching to ignore case. */
+	@Test
+	public void testApplyXpathRule_case() throws Exception {
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+		final Document doc = execute("<a><b>Xxxx</b></a>");
+		assertEquals("AAAA", doc.valueOf(xpath));
+		assertNull(rule.verifyValue("AAAA"));
+		assertNotNull(rule.verifyValue("Xxxx"));
+	}
+
+	/**
+	 * Parses xml, applies the rule under test to it and returns the (modified) document.
+	 */
+	private Document execute(final String xml) throws Exception {
+		final SAXReader reader = new SAXReader();
+		final Document doc = reader.read(new StringReader(xml));
+		System.out.println("BEFORE:\n" + doc.asXML() + "\n");
+		rule.applyXpathRule(doc);
+		System.out.println("AFTER:\n" + doc.asXML() + "\n");
+		System.out.println("-----------------------------\n");
+		return doc;
+	}
+
+}
Index: modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-2.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java
===================================================================
--- modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-2.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-2.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/VocabularyRule.java (revision 45159)
@@ -0,0 +1,114 @@
+package eu.dnetlib.data.utility.cleaner;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.enabling.tools.ServiceLocator;
+
+/**
+ * @author michele
+ *
+ * Vocabulary rules must be declared in a CleanerDS profile, for each vocabulary must be present the relative VocabularyDS profile:
+ *
+ *
+ */
+
+public class VocabularyRule extends XPATHCleaningRule {
+
+ private Set vocabularies;
+
+ private static final Log log = LogFactory.getLog(VocabularyRule.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ private Map synonyms = Maps.newHashMap();
+ private Set validTerms = Sets.newHashSet();
+
+ public VocabularyRule(final Set vocabularies, final ServiceLocator lookupLocator) throws CleanerException {
+ this.vocabularies = vocabularies;
+
+ loadSynonymsAndTerms(lookupLocator);
+ }
+
+ @Override
+ protected String calculateNewValue(final String oldValue) throws CleanerException {
+ log.debug("calculating new value for: " + oldValue);
+
+ if (synonyms.isEmpty()) {
+ log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies);
+ }
+
+ String newValue = null;
+
+ if (synonyms.containsKey(oldValue.toLowerCase())) {
+ newValue = synonyms.get(oldValue.toLowerCase());
+ }
+
+ if (newValue == null) {
+ log.debug("Synonym " + oldValue + " not found in vocabulary");
+ return oldValue;
+ }
+
+ return newValue;
+ }
+
+ private void loadSynonymsAndTerms(final ServiceLocator lookupLocator) throws CleanerException {
+
+ for (final String vocabulary : vocabularies) {
+ try {
+ final String query = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')"
+ + "//RESOURCE_PROFILE[.//VOCABULARY_NAME/@code='" + vocabulary + "']//TERM return "
+ + "( concat($x/@code,'|-:-|', $x/@code), concat($x/@english_name,'|-:-|', $x/@code), concat($x/@native_name,'|-:-|', $x/@code), "
+ + "for $y in $x//SYNONYM return concat($y/@term,'|-:-|', $x/@code) )";
+
+ for (final String s : lookupLocator.getService().quickSearchProfile(query)) {
+ log.debug("SYNONYM : " + s);
+ final String[] arr = s.split("\\|-:-\\|");
+ if ((arr[0] == null) || arr[0].isEmpty()) {
+ continue;
+ }
+ synonyms.put(arr[0].toLowerCase(), arr[1]);
+ validTerms.add(arr[1].toLowerCase());
+ }
+
+ log.info("VOCABULARY " + vocabulary.trim() + " - terms size " + synonyms.size());
+ } catch (final Exception e) {
+ throw new CleanerException("Error obtaining vocabulary " + vocabulary, e);
+ }
+ }
+
+ }
+
+ @Override
+ protected Map verifyValue(final String value) throws CleanerException {
+ if (synonyms.isEmpty()) {
+ log.warn("Vocabulary terms is void, vocabularies: " + this.vocabularies);
+ }
+
+ if (validTerms.contains(value.toLowerCase())) return null;
+
+ final Map error = new HashMap();
+ error.put("term", value);
+ error.put("vocabularies", this.vocabularies.toString().replaceAll("\\[", "").replaceAll("\\]", ""));
+ error.put("xpath", this.getXpath());
+ return error;
+ }
+
+ public Map getVocabularyTerms() {
+ return synonyms;
+ }
+
+ @Override
+ public String toString() {
+ return "VOCABULARIES: [" + Joiner.on(", ").join(vocabularies) + "]";
+ }
+
+}
Index: modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java
===================================================================
--- modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/XMLCleaningRuleTest.java (revision 45159)
@@ -0,0 +1,84 @@
+package eu.dnetlib.data.utility.cleaner;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.runners.MockitoJUnit44Runner;
+
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.data.utility.cleaner.rmi.CleanerException;
+
+/**
+ * Unit tests for {@link CleaningRule} driven by a stub {@link XPATHCleaningRule} that rewrites
+ * HELLO to CIAO and accepts only CIAO.
+ *
+ * NOTE(review): the markup of both XML fixtures had been lost (only whitespace and the text of
+ * the //a node were left). They are rebuilt as six fragments around an a element; every other
+ * element name, in particular header, is an assumption. Confirm against the repository copy.
+ */
+@RunWith(MockitoJUnit44Runner.class)
+public class XMLCleaningRuleTest {
+
+	private static final String INPUT_VALID = "<record>" + " <header />" + " <metadata>" + "  <a>HELLO</a>" + " </metadata>" + "</record>";
+
+	private static final String INPUT_INVALID = "<record>" + " <header />" + " <metadata>" + "  <a>GOOD BYE</a>" + " </metadata>" + "</record>";
+
+	/**
+	 * Class under test.
+	 */
+	private CleaningRule xmlRule;
+
+	/** Stub rule: maps HELLO to CIAO, leaves anything else unchanged and rejects all but CIAO. */
+	private XPATHCleaningRule mockXpathRule = new XPATHCleaningRule() {
+
+		@Override
+		protected Map<String, String> verifyValue(final String value) throws CleanerException {
+			if (value.equals("CIAO")) { return null; }
+
+			Map<String, String> err = new HashMap<String, String>();
+			err.put("term", value);
+			return err;
+		}
+
+		@Override
+		protected String calculateNewValue(final String oldValue) throws CleanerException {
+			if (oldValue.equals("HELLO")) { return "CIAO"; }
+			return oldValue;
+		}
+	};
+
+	@Before
+	public void setUp() throws Exception {
+		xmlRule = new CleaningRule();
+
+		mockXpathRule.setStrict(true);
+		mockXpathRule.setXpath("//a");
+
+		xmlRule.setXpathRules(Lists.newArrayList(mockXpathRule));
+	}
+
+	/** Expects the cleaned output to contain CIAO and no "invalid" marker. */
+	@Test
+	public void testEvaluate_valid() {
+		String s = xmlRule.evaluate(INPUT_VALID);
+		assertTrue(s.contains("CIAO"));
+		assertFalse(s.contains("invalid"));
+	}
+
+	/** Expects the output for an unknown value to contain an "invalid" marker and no CIAO. */
+	@Test
+	public void testEvaluate_invalid() {
+		String s = xmlRule.evaluate(INPUT_INVALID);
+		System.out.println(s);
+		assertFalse(s.contains("CIAO"));
+		assertTrue(s.contains("invalid"));
+	}
+
+}
Index: modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java
===================================================================
--- modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/trunk/src/test/java/eu/dnetlib/data/utility/cleaner/GroovyRuleTest.java (revision 45159)
@@ -0,0 +1,127 @@
+package eu.dnetlib.data.utility.cleaner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.dom4j.Document;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.runners.MockitoJUnit44Runner;
+
+/**
+ * Unit tests for {@link GroovyRule}: each test builds a rule from a Groovy expression and checks
+ * the values selected by the xpath after the rule has been applied.
+ *
+ * NOTE(review): the XML fixtures had lost their markup; they are rebuilt here from each test's
+ * xpath and assertions. Attribute values that no assertion pins down are placeholders.
+ */
+@RunWith(MockitoJUnit44Runner.class)
+public class GroovyRuleTest {
+
+	@Test
+	public void testApplyXpathRule_simple_constant() throws Exception {
+		final GroovyRule rule = new GroovyRule("'YYYY'");
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		final Document doc = execute(rule, "<a><b>XXXX</b></a>");
+
+		assertEquals("YYYY", doc.valueOf(xpath));
+	}
+
+	@Test
+	public void testApplyXpathRule_simple_regex() throws Exception {
+		final GroovyRule rule = new GroovyRule("(input =~ /X/).replaceAll('Y')");
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		final Document doc = execute(rule, "<a><b>aXaXa</b></a>");
+
+		assertEquals("aYaYa", doc.valueOf(xpath));
+	}
+
+	@Test
+	public void testApplyXpathRule_simple_upper() throws Exception {
+		final GroovyRule rule = new GroovyRule("input.toUpperCase()");
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		final Document doc = execute(rule, "<a><b>xyz</b></a>");
+
+		assertEquals("XYZ", doc.valueOf(xpath));
+	}
+
+	@Test
+	public void testApplyXpathRule_multi() throws Exception {
+		final GroovyRule rule = new GroovyRule("'Y'");
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		final Document doc = execute(rule, "<a><b>X</b><b>X</b><b>X</b></a>");
+		final List<?> list = doc.selectNodes(xpath);
+
+		assertEquals(3, list.size());
+		for (Object o : list) {
+			assertEquals("Y", ((Node) o).getText());
+		}
+	}
+
+	@Test
+	public void testApplyXpathRule_singleAttr() throws Exception {
+		final GroovyRule rule = new GroovyRule("'BBBB'");
+		final String xpath = "/a/b/@value";
+		rule.setXpath(xpath);
+
+		// The initial attribute value is a placeholder; only the element text is asserted below.
+		final Document doc = execute(rule, "<a><b value='AAAA'>XXXX</b></a>");
+
+		assertEquals("BBBB", doc.valueOf(xpath));
+		assertEquals("XXXX", doc.valueOf("/a/b"));
+	}
+
+	@Test
+	public void testApplyXpathRule_multiAttr() throws Exception {
+		final GroovyRule rule = new GroovyRule("'B'");
+		final String xpath = "/a/b/@value";
+		rule.setXpath(xpath);
+
+		// The initial attribute values are placeholders; the rule replaces each of them.
+		final Document doc = execute(rule, "<a><b value='X'/><b value='X'/><b value='X'/></a>");
+		final List<?> list = doc.selectNodes(xpath);
+
+		assertEquals(3, list.size());
+		for (Object o : list) {
+			assertEquals("B", ((Node) o).getText());
+		}
+	}
+
+	@Test
+	public void testApplyXpathRule_complex() throws Exception {
+		final GroovyRule rule = new GroovyRule("'B'");
+		final String xpath = "/a/b";
+		rule.setXpath(xpath);
+
+		final Document doc = execute(rule, "<a><b>X<c>C</c></b></a>");
+
+		assertTrue(doc.valueOf(xpath).contains("B"));
+		assertEquals("C", doc.valueOf("/a/b/c"));
+	}
+
+	/** Parses xml, applies rule to it and returns the (modified) document. */
+	private Document execute(final GroovyRule rule, final String xml) throws Exception {
+		final SAXReader reader = new SAXReader();
+		final Document doc = reader.read(new StringReader(xml));
+
+		System.out.println("BEFORE:\n" + doc.asXML() + "\n");
+		rule.applyXpathRule(doc);
+		System.out.println("AFTER:\n" + doc.asXML() + "\n");
+		System.out.println("-----------------------------\n");
+
+		return doc;
+	}
+}
Index: modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java
===================================================================
--- modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java (nonexistent)
+++ modules/cnr-data-utility-cleaner-service/tags/cnr-data-utility-cleaner-service-3.0.0/src/main/java/eu/dnetlib/data/utility/cleaner/CleaningRule.java (revision 45159)
@@ -0,0 +1,67 @@
+package eu.dnetlib.data.utility.cleaner;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.Element;
+import org.dom4j.Namespace;
+import org.dom4j.QName;
+import org.dom4j.io.SAXReader;
+import org.springframework.beans.factory.annotation.Required;
+
+import eu.dnetlib.miscutils.functional.UnaryFunction;
+
+public class CleaningRule implements UnaryFunction {
+
+ private static final Log log = LogFactory.getLog(CleaningRule.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ private List xpathRules = new ArrayList();
+
+ @Override
+ public String evaluate(final String text) {
+
+ try {
+ final List