Revision 55840
Added by Claudio Atzori almost 5 years ago
modules/dnet-openaire-blacklist/trunk/src/main/java/eu/dnetlib/openaire/blacklist/OpenaireIdResolver.java | ||
---|---|---|
3 | 3 |
import java.io.StringReader; |
4 | 4 |
import java.util.List; |
5 | 5 |
|
6 |
import com.google.common.base.Function; |
|
7 |
import com.google.common.collect.Iterables; |
|
8 |
import com.google.common.collect.Lists; |
|
9 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; |
|
10 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; |
|
11 |
import eu.dnetlib.enabling.locators.UniqueServiceLocator; |
|
12 |
|
|
13 |
import eu.dnetlib.functionality.index.client.solr.SolrIndexClient; |
|
14 |
import eu.dnetlib.functionality.index.client.solr.SolrIndexClientFactory; |
|
6 | 15 |
import org.apache.commons.lang.StringUtils; |
7 | 16 |
import org.apache.commons.logging.Log; |
8 | 17 |
import org.apache.commons.logging.LogFactory; |
9 |
import org.apache.solr.client.solrj.SolrQuery; |
|
10 |
import org.apache.solr.client.solrj.impl.CloudSolrServer; |
|
11 | 18 |
import org.apache.solr.client.solrj.response.QueryResponse; |
12 | 19 |
import org.apache.solr.common.SolrDocument; |
13 | 20 |
import org.apache.solr.common.SolrDocumentList; |
... | ... | |
17 | 24 |
import org.dom4j.io.SAXReader; |
18 | 25 |
import org.springframework.beans.factory.annotation.Autowired; |
19 | 26 |
|
20 |
import com.google.common.base.Function; |
|
21 |
import com.google.common.collect.Iterables; |
|
22 |
import com.google.common.collect.Lists; |
|
23 |
|
|
24 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; |
|
25 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; |
|
26 |
import eu.dnetlib.enabling.locators.UniqueServiceLocator; |
|
27 |
|
|
28 | 27 |
/** |
29 | 28 |
* The goal of this task is to return the original identifiers of the objects that deduplication merged into a representative object. Created by |
30 | 29 |
* alessia on 09/02/16. |
31 | 30 |
*/ |
32 | 31 |
public class OpenaireIdResolver { |
33 | 32 |
|
34 |
public final static String SOLR_COLLECTION_POSTFIX = "-index-openaire"; |
|
35 | 33 |
private static final Log log = LogFactory.getLog(OpenaireIdResolver.class); |
36 | 34 |
private final static String RESULT_FIELD = "__result"; |
37 | 35 |
private final static String XPATH_TO_MERGED = "//*[local-name()='entity']/*//children/result/@objidentifier"; |
38 | 36 |
private final SAXReader saxReader = new SAXReader(); |
39 | 37 |
@Autowired |
40 | 38 |
private UniqueServiceLocator serviceLocator; |
39 |
@Autowired |
|
40 |
private SolrIndexClientFactory indexClientFactory; |
|
41 | 41 |
|
42 | 42 |
public List<String> resolveIdentifier(final String id) { |
43 | 43 |
if (StringUtils.isBlank(id)) return Lists.newArrayList(); |
... | ... | |
47 | 47 |
} |
48 | 48 |
|
49 | 49 |
protected List<String> findOriginalIds(final String id) { |
50 |
CloudSolrServer solrCore = null; |
|
51 |
final SolrQuery q = new SolrQuery("objidentifier:\"" + id + "\""); |
|
52 |
QueryResponse response = null; |
|
53 |
try { |
|
54 |
solrCore = new CloudSolrServer(getIndexEndpoint()); |
|
55 |
solrCore.setDefaultCollection(getPublicIndexCollection() + SOLR_COLLECTION_POSTFIX); |
|
56 |
response = solrCore.query(q); |
|
50 |
try(final SolrIndexClient client = (SolrIndexClient) indexClientFactory.getClient(getPublicIndexCollection())) { |
|
51 |
|
|
52 |
final String query = String.format("objidentifier:\"%s\"", id); |
|
53 |
final QueryResponse response = client.query(query, 1); |
|
57 | 54 |
final SolrDocumentList results = response.getResults(); |
58 | 55 |
if (results.isEmpty()) { |
59 |
log.debug("Query " + q + " returned 0 documents"); |
|
56 |
log.debug("Query " + query + " returned 0 documents"); |
|
60 | 57 |
return Lists.newArrayList(); |
61 | 58 |
} |
62 | 59 |
// my results contain the document with the given identifier |
... | ... | |
65 | 62 |
} catch (final Exception e) { |
66 | 63 |
log.error("Can't get original ids for " + id + "\n ", e); |
67 | 64 |
throw new RuntimeException("Can't get original ids for " + id + "\n " + e); |
68 |
} finally { |
|
69 |
solrCore.shutdown(); |
|
70 | 65 |
} |
71 | 66 |
} |
72 | 67 |
|
... | ... | |
74 | 69 |
protected List<String> extractMergedIdentifiers(final SolrDocument doc) throws DocumentException { |
75 | 70 |
final String xmlRecord = (String) doc.getFirstValue(RESULT_FIELD); |
76 | 71 |
final Document xmlDoc = this.saxReader.read(new StringReader(xmlRecord)); |
77 |
return Lists.newArrayList(Iterables.transform(xmlDoc.selectNodes(XPATH_TO_MERGED), new Function<Attribute, String>() { |
|
78 |
|
|
79 |
@Override |
|
80 |
public String apply(final Attribute a) { |
|
81 |
return a.getStringValue(); |
|
82 |
} |
|
83 |
})); |
|
72 |
return Lists.newArrayList(Iterables.transform(xmlDoc.selectNodes(XPATH_TO_MERGED), (Function<Attribute, String>) a -> a.getStringValue())); |
|
84 | 73 |
} |
85 | 74 |
|
86 | 75 |
protected String getIndexEndpoint() throws ISLookUpException { |
modules/dnet-openaire-blacklist/trunk/pom.xml | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 | 2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-openaire-blacklist</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>1.1.4-SNAPSHOT</version> |
|
14 |
<scm> |
|
15 |
<developerConnection> |
|
16 |
scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-openaire-blacklist/trunk |
|
17 |
</developerConnection> |
|
18 |
</scm> |
|
19 |
<dependencies> |
|
20 |
<dependency> |
|
21 |
<groupId>eu.dnetlib</groupId> |
|
22 |
<artifactId>cnr-resultset-client</artifactId> |
|
23 |
<version>[2.0.0,3.0.0)</version> |
|
24 |
</dependency> |
|
25 |
<dependency> |
|
26 |
<groupId>eu.dnetlib</groupId> |
|
27 |
<artifactId>cnr-enabling-database-api</artifactId> |
|
28 |
<version>[2.0.0,3.0.0)</version> |
|
29 |
</dependency> |
|
30 |
<dependency> |
|
31 |
<groupId>eu.dnetlib</groupId> |
|
32 |
<artifactId>dnet-modular-ui</artifactId> |
|
33 |
<version>[3.0.0,4.0.0)</version> |
|
34 |
</dependency> |
|
35 |
<dependency> |
|
36 |
<groupId>eu.dnetlib</groupId> |
|
37 |
<artifactId>dnet-msro-service</artifactId> |
|
38 |
<version>[3.0.0,4.0.0)</version> |
|
39 |
</dependency> |
|
40 |
<dependency> |
|
41 |
<groupId>eu.dnetlib</groupId> |
|
42 |
<artifactId>dnet-hadoop-service-rmi</artifactId> |
|
43 |
<version>[1.0.0,2.0.0)</version> |
|
44 |
</dependency> |
|
45 |
<dependency> |
|
46 |
<groupId>eu.dnetlib</groupId> |
|
47 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
|
48 |
<version>[6.0.0,7.0.0)</version> |
|
49 |
</dependency> |
|
50 |
<dependency> |
|
51 |
<groupId>javax.servlet</groupId> |
|
52 |
<artifactId>javax.servlet-api</artifactId> |
|
53 |
<version>${javax.servlet.version}</version> |
|
54 |
<scope>provided</scope> |
|
55 |
</dependency> |
|
56 |
<dependency> |
|
57 |
<groupId>junit</groupId> |
|
58 |
<artifactId>junit</artifactId> |
|
59 |
<version>${junit.version}</version> |
|
60 |
<scope>test</scope> |
|
61 |
</dependency> |
|
62 |
<dependency> |
|
63 |
<groupId>org.mockito</groupId> |
|
64 |
<artifactId>mockito-core</artifactId> |
|
65 |
<version>1.9.5</version> |
|
66 |
</dependency> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0-SNAPSHOT</version> |
|
7 |
<relativePath/> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-openaire-blacklist</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>1.2.0-SNAPSHOT</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-openaire-blacklist/trunk</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>eu.dnetlib</groupId> |
|
20 |
<artifactId>cnr-resultset-client</artifactId> |
|
21 |
<version>[2.0.0,3.0.0)</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>eu.dnetlib</groupId> |
|
25 |
<artifactId>cnr-enabling-database-api</artifactId> |
|
26 |
<version>[2.0.0,3.0.0)</version> |
|
27 |
</dependency> |
|
28 |
<dependency> |
|
29 |
<groupId>eu.dnetlib</groupId> |
|
30 |
<artifactId>dnet-modular-ui</artifactId> |
|
31 |
<version>[3.0.0,4.0.0)</version> |
|
32 |
</dependency> |
|
33 |
<dependency> |
|
34 |
<groupId>eu.dnetlib</groupId> |
|
35 |
<artifactId>dnet-msro-service</artifactId> |
|
36 |
<version>[3.0.0,4.0.0)</version> |
|
37 |
</dependency> |
|
38 |
<dependency> |
|
39 |
<groupId>eu.dnetlib</groupId> |
|
40 |
<artifactId>dnet-hadoop-service-rmi</artifactId> |
|
41 |
<version>[1.0.0,2.0.0)</version> |
|
42 |
</dependency> |
|
43 |
<dependency> |
|
44 |
<groupId>eu.dnetlib</groupId> |
|
45 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
|
46 |
<version>[6.3.24,7.0.0)</version> |
|
47 |
</dependency> |
|
48 |
<dependency> |
|
49 |
<groupId>eu.dnetlib</groupId> |
|
50 |
<artifactId>dnet-index-client</artifactId> |
|
51 |
<version>[2.3.4,3.0.0)</version> |
|
52 |
</dependency> |
|
53 |
<dependency> |
|
54 |
<groupId>javax.servlet</groupId> |
|
55 |
<artifactId>javax.servlet-api</artifactId> |
|
56 |
<version>${javax.servlet.version}</version> |
|
57 |
<scope>provided</scope> |
|
58 |
</dependency> |
|
59 |
<dependency> |
|
60 |
<groupId>junit</groupId> |
|
61 |
<artifactId>junit</artifactId> |
|
62 |
<version>${junit.version}</version> |
|
63 |
<scope>test</scope> |
|
64 |
</dependency> |
|
65 |
<dependency> |
|
66 |
<groupId>org.mockito</groupId> |
|
67 |
<artifactId>mockito-core</artifactId> |
|
68 |
<version>1.9.5</version> |
|
69 |
</dependency> |
|
67 | 70 |
|
68 |
</dependencies> |
|
71 |
</dependencies> |
|
69 | 72 |
</project> |
Also available in: Unified diff
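adjusted dependencies: added dnet-index-client so that OpenaireIdResolver queries the index through an injected SolrIndexClientFactory instead of building a CloudSolrServer itself; bumped the module version to 1.2.0-SNAPSHOT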