Revision 63084
Added by Claudio Atzori 9 months ago
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>7.0.1</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1</developerConnection> |
|
16 |
</scm> |
|
17 |
|
|
18 |
<properties> |
|
19 |
<skipITs>true</skipITs> |
|
20 |
</properties> |
|
21 |
<build> |
|
22 |
<plugins> |
|
23 |
<plugin> |
|
24 |
<groupId>org.apache.maven.plugins</groupId> |
|
25 |
<artifactId>maven-failsafe-plugin</artifactId> |
|
26 |
<version>2.19.1</version> |
|
27 |
<executions> |
|
28 |
<execution> |
|
29 |
<id>integration-test</id> |
|
30 |
<goals> |
|
31 |
<goal>integration-test</goal> |
|
32 |
</goals> |
|
33 |
</execution> |
|
34 |
<execution> |
|
35 |
<id>verify</id> |
|
36 |
<goals> |
|
37 |
<goal>verify</goal> |
|
38 |
</goals> |
|
39 |
</execution> |
|
40 |
</executions> |
|
41 |
<configuration> |
|
42 |
<skipITs>${skipITs}</skipITs> |
|
43 |
</configuration> |
|
44 |
</plugin> |
|
45 |
</plugins> |
|
46 |
</build> |
|
47 |
|
|
48 |
<dependencies> |
|
49 |
<dependency> |
|
50 |
<groupId>com.google.guava</groupId> |
|
51 |
<artifactId>guava</artifactId> |
|
52 |
<version>${google.guava.version}</version> |
|
53 |
</dependency> |
|
54 |
<dependency> |
|
55 |
<groupId>junit</groupId> |
|
56 |
<artifactId>junit</artifactId> |
|
57 |
<version>${junit.version}</version> |
|
58 |
<scope>test</scope> |
|
59 |
</dependency> |
|
60 |
<dependency> |
|
61 |
<groupId>com.ximpleware</groupId> |
|
62 |
<artifactId>vtd-xml</artifactId> |
|
63 |
<version>[2.12, 3.0.0)</version> |
|
64 |
</dependency> |
|
65 |
<dependency> |
|
66 |
<groupId>commons-codec</groupId> |
|
67 |
<artifactId>commons-codec</artifactId> |
|
68 |
<version>${commons.codec.version}</version> |
|
69 |
</dependency> |
|
70 |
<dependency> |
|
71 |
<groupId>dom4j</groupId> |
|
72 |
<artifactId>dom4j</artifactId> |
|
73 |
<version>${dom4j.version}</version> |
|
74 |
<exclusions> |
|
75 |
<exclusion> |
|
76 |
<artifactId>xml-apis</artifactId> |
|
77 |
<groupId>xml-apis</groupId> |
|
78 |
</exclusion> |
|
79 |
</exclusions> |
|
80 |
</dependency> |
|
81 |
<dependency> |
|
82 |
<groupId>net.sf.supercsv</groupId> |
|
83 |
<artifactId>super-csv</artifactId> |
|
84 |
<version>2.4.0</version> |
|
85 |
</dependency> |
|
86 |
<dependency> |
|
87 |
<groupId>eu.dnetlib</groupId> |
|
88 |
<artifactId>dnet-openaire-data-protos</artifactId> |
|
89 |
<version>[3.9.8]</version> |
|
90 |
</dependency> |
|
91 |
<dependency> |
|
92 |
<groupId>eu.dnetlib</groupId> |
|
93 |
<artifactId>dnet-pace-core</artifactId> |
|
94 |
<version>[3.0.0,4.0.0)</version> |
|
95 |
</dependency> |
|
96 |
<dependency> |
|
97 |
<groupId>eu.dnetlib</groupId> |
|
98 |
<artifactId>cnr-misc-utils</artifactId> |
|
99 |
<version>[1.0.0,2.0.0)</version> |
|
100 |
</dependency> |
|
101 |
<dependency> |
|
102 |
<groupId>eu.dnetlib</groupId> |
|
103 |
<artifactId>dnet-hadoop-commons</artifactId> |
|
104 |
<version>[2.0.0,3.0.0)</version> |
|
105 |
</dependency> |
|
106 |
<dependency> |
|
107 |
<groupId>eu.dnetlib</groupId> |
|
108 |
<artifactId>dnet-index-solr-common</artifactId> |
|
109 |
<version>[3.0.1,4.0.0)</version> |
|
110 |
</dependency> |
|
111 |
<dependency> |
|
112 |
<groupId>com.googlecode.protobuf-java-format</groupId> |
|
113 |
<artifactId>protobuf-java-format</artifactId> |
|
114 |
<version>1.2</version> |
|
115 |
</dependency> |
|
116 |
<dependency> |
|
117 |
<groupId>org.apache.commons</groupId> |
|
118 |
<artifactId>commons-lang3</artifactId> |
|
119 |
<version>3.5</version> |
|
120 |
</dependency> |
|
121 |
|
|
122 |
<!-- test deps --> |
|
123 |
<dependency> |
|
124 |
<groupId>eu.dnetlib</groupId> |
|
125 |
<artifactId>dnet-openaireplus-profiles</artifactId> |
|
126 |
<version>[1.0.0,2.0.0)</version> |
|
127 |
<scope>test</scope> |
|
128 |
</dependency> |
|
129 |
<dependency> |
|
130 |
<groupId>org.mongodb</groupId> |
|
131 |
<artifactId>mongo-java-driver</artifactId> |
|
132 |
<version>${mongodb.driver.version}</version> |
|
133 |
<scope>test</scope> |
|
134 |
</dependency> |
|
135 |
<dependency> |
|
136 |
<groupId>org.springframework</groupId> |
|
137 |
<artifactId>spring-context</artifactId> |
|
138 |
<version>${spring.version}</version> |
|
139 |
<scope>test</scope> |
|
140 |
</dependency> |
|
141 |
<dependency> |
|
142 |
<groupId>org.springframework</groupId> |
|
143 |
<artifactId>spring-core</artifactId> |
|
144 |
<version>${spring.version}</version> |
|
145 |
<scope>test</scope> |
|
146 |
</dependency> |
|
147 |
<dependency> |
|
148 |
<groupId>org.springframework</groupId> |
|
149 |
<artifactId>spring-test</artifactId> |
|
150 |
<version>${spring.version}</version> |
|
151 |
<scope>test</scope> |
|
152 |
</dependency> |
|
153 |
|
|
154 |
</dependencies> |
|
155 |
</project> |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/Community.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
|
|
7 |
import java.util.ArrayList; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
/** |
|
11 |
* Created by miriam on 01/08/2018. |
|
12 |
*/ |
|
13 |
public class Community { |
|
14 |
|
|
15 |
private static final Log log = LogFactory.getLog(Community.class); |
|
16 |
|
|
17 |
private String id; |
|
18 |
private List<String> subjects = new ArrayList<>(); |
|
19 |
private List<Datasource> datasources = new ArrayList<>(); |
|
20 |
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); |
|
21 |
private List<Organization> organizationCommunity = new ArrayList<>(); |
|
22 |
|
|
23 |
public List<Organization> getOrganizationCommunity() { |
|
24 |
return organizationCommunity; |
|
25 |
} |
|
26 |
|
|
27 |
public void setOrganizationCommunity(List<Organization> organizationCommunity) { |
|
28 |
this.organizationCommunity = organizationCommunity; |
|
29 |
} |
|
30 |
|
|
31 |
public String toJson() { |
|
32 |
final Gson g = new Gson(); |
|
33 |
return g.toJson(this); |
|
34 |
} |
|
35 |
|
|
36 |
public boolean isValid() { |
|
37 |
return !getSubjects().isEmpty() || !getDatasources().isEmpty() || !getZenodoCommunities().isEmpty(); |
|
38 |
} |
|
39 |
|
|
40 |
public String getId() { |
|
41 |
return id; |
|
42 |
} |
|
43 |
|
|
44 |
public void setId(String id) { |
|
45 |
this.id = id; |
|
46 |
} |
|
47 |
|
|
48 |
public List<String> getSubjects() { |
|
49 |
return subjects; |
|
50 |
} |
|
51 |
|
|
52 |
public void setSubjects(List<String> subjects) { |
|
53 |
this.subjects = subjects; |
|
54 |
} |
|
55 |
|
|
56 |
public List<Datasource> getDatasources() { |
|
57 |
return datasources; |
|
58 |
} |
|
59 |
|
|
60 |
public void setDatasources(List<Datasource> datasources) { |
|
61 |
this.datasources = datasources; |
|
62 |
} |
|
63 |
|
|
64 |
public List<ZenodoCommunity> getZenodoCommunities() { |
|
65 |
return zenodoCommunities; |
|
66 |
} |
|
67 |
|
|
68 |
public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) { |
|
69 |
this.zenodoCommunities = zenodoCommunities; |
|
70 |
} |
|
71 |
|
|
72 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/index/CloudIndexClient.java | ||
---|---|---|
1 |
package eu.dnetlib.data.index; |
|
2 |
|
|
3 |
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory; |
|
4 |
import eu.dnetlib.miscutils.datetime.HumanTime; |
|
5 |
import eu.dnetlib.miscutils.functional.UnaryFunction; |
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
import org.apache.solr.client.solrj.SolrQuery; |
|
9 |
import org.apache.solr.client.solrj.SolrServerException; |
|
10 |
import org.apache.solr.client.solrj.impl.CloudSolrClient; |
|
11 |
import org.apache.solr.client.solrj.response.QueryResponse; |
|
12 |
import org.apache.solr.client.solrj.response.UpdateResponse; |
|
13 |
import org.apache.solr.common.SolrInputDocument; |
|
14 |
|
|
15 |
import java.io.Closeable; |
|
16 |
import java.io.IOException; |
|
17 |
import java.text.SimpleDateFormat; |
|
18 |
import java.util.Date; |
|
19 |
import java.util.List; |
|
20 |
|
|
21 |
/** |
|
22 |
* Created by michele on 11/11/15. |
|
23 |
*/ |
|
24 |
public class CloudIndexClient implements Closeable { |
|
25 |
|
|
26 |
private static final Log log = LogFactory.getLog(CloudIndexClient.class); |
|
27 |
private static final String INDEX_RECORD_RESULT_FIELD = "dnetResult"; |
|
28 |
|
|
29 |
private final CloudSolrClient solrClient; |
|
30 |
|
|
31 |
protected CloudIndexClient(final CloudSolrClient solrServer) { |
|
32 |
this.solrClient = solrServer; |
|
33 |
} |
|
34 |
|
|
35 |
public int feed(final String record, final UnaryFunction<String, String> toIndexRecord) throws CloudIndexClientException { |
|
36 |
return feed(record, toIndexRecord, true); |
|
37 |
} |
|
38 |
|
|
39 |
public int feed(final String record, final UnaryFunction<String, String> toIndexRecord, final boolean commit) |
|
40 |
throws CloudIndexClientException { |
|
41 |
try { |
|
42 |
final SolrInputDocument doc = prepareSolrDocument(record, toIndexRecord); |
|
43 |
if ((doc == null) || doc.isEmpty()) throw new CloudIndexClientException("Invalid solr document"); |
|
44 |
return feed(doc, commit); |
|
45 |
} catch (final Throwable e) { |
|
46 |
throw new CloudIndexClientException("Error feeding document", e); |
|
47 |
} |
|
48 |
} |
|
49 |
|
|
50 |
public int feed(final SolrInputDocument document) throws CloudIndexClientException { |
|
51 |
return feed(document, true); |
|
52 |
} |
|
53 |
|
|
54 |
public int feed(final SolrInputDocument document, final boolean commit) throws CloudIndexClientException { |
|
55 |
try { |
|
56 |
final UpdateResponse res = solrClient.add(document); |
|
57 |
log.debug("feed time for single records, elapsed time: " + HumanTime.exactly(res.getElapsedTime())); |
|
58 |
if (res.getStatus() != 0) { throw new CloudIndexClientException("bad status: " + res.getStatus()); } |
|
59 |
if (commit) { |
|
60 |
solrClient.commit(); |
|
61 |
} |
|
62 |
return res.getStatus(); |
|
63 |
} catch (final Throwable e) { |
|
64 |
throw new CloudIndexClientException("Error feeding document", e); |
|
65 |
} |
|
66 |
} |
|
67 |
|
|
68 |
public void feed(final List<SolrInputDocument> docs, final AfterFeedingCallback callback) throws CloudIndexClientException { |
|
69 |
feed(docs, callback, true); |
|
70 |
} |
|
71 |
|
|
72 |
public void feed(final List<SolrInputDocument> docs, final AfterFeedingCallback callback, final boolean commit) throws CloudIndexClientException { |
|
73 |
try { |
|
74 |
if (docs.isEmpty()) { |
|
75 |
log.debug("Empty list of documents. Calling callback, if needed."); |
|
76 |
if (callback != null) { |
|
77 |
callback.doAfterFeeding(null); |
|
78 |
} |
|
79 |
return; |
|
80 |
} |
|
81 |
final UpdateResponse res = solrClient.add(docs); |
|
82 |
|
|
83 |
log.debug("feed time for " + docs.size() + " records, elapsed tipe: : " + HumanTime.exactly(res.getElapsedTime())); |
|
84 |
|
|
85 |
if (commit) { |
|
86 |
solrClient.commit(); |
|
87 |
} |
|
88 |
if (callback != null) { |
|
89 |
callback.doAfterFeeding(res); |
|
90 |
} |
|
91 |
if (res.getStatus() != 0) throw new CloudIndexClientException("bad status: " + res.getStatus()); |
|
92 |
} catch (final Throwable e) { |
|
93 |
throw new CloudIndexClientException("Error feeding documents", e); |
|
94 |
} |
|
95 |
} |
|
96 |
|
|
97 |
public SolrInputDocument prepareSolrDocument(final String record, final UnaryFunction<String, String> toIndexRecord) |
|
98 |
throws CloudIndexClientException { |
|
99 |
try { |
|
100 |
final StreamingInputDocumentFactory documentFactory = new StreamingInputDocumentFactory(); |
|
101 |
|
|
102 |
final String version = (new SimpleDateFormat("yyyy-MM-dd\'T\'hh:mm:ss\'Z\'")).format(new Date()); |
|
103 |
final String indexRecord = toIndexRecord.evaluate(record); |
|
104 |
|
|
105 |
if (log.isDebugEnabled()) { |
|
106 |
log.debug("***************************************\nSubmitting index record:\n" + indexRecord + "\n***************************************\n"); |
|
107 |
} |
|
108 |
|
|
109 |
return documentFactory.parseDocument(version, indexRecord, INDEX_RECORD_RESULT_FIELD); |
|
110 |
} catch (final Throwable e) { |
|
111 |
throw new CloudIndexClientException("Error creating solr document", e); |
|
112 |
} |
|
113 |
} |
|
114 |
|
|
115 |
public boolean isRecordIndexed(final String id) throws CloudIndexClientException { |
|
116 |
final QueryResponse res = query("objidentifier:\"" + id + "\"", null); |
|
117 |
return res.getResults().size() > 0; |
|
118 |
} |
|
119 |
|
|
120 |
public int remove(final String id) throws CloudIndexClientException { |
|
121 |
return remove(id, true); |
|
122 |
} |
|
123 |
|
|
124 |
public int remove(final String id, final boolean commit) throws CloudIndexClientException { |
|
125 |
String q = String.format("objidentifier:\"%s\" OR resultdupid:\"%s\"", id, id); |
|
126 |
try { |
|
127 |
final UpdateResponse res = solrClient.deleteByQuery(q); |
|
128 |
if (commit) { |
|
129 |
solrClient.commit(); |
|
130 |
} |
|
131 |
return res.getResponse().size(); |
|
132 |
} catch (final Throwable e) { |
|
133 |
throw new CloudIndexClientException("Error removing documents", e); |
|
134 |
} |
|
135 |
} |
|
136 |
|
|
137 |
public int count(final String query) throws CloudIndexClientException { |
|
138 |
final QueryResponse res = query(query, 0); |
|
139 |
return res.getResults().size(); |
|
140 |
} |
|
141 |
|
|
142 |
public QueryResponse query(final String query, Integer rows) throws CloudIndexClientException { |
|
143 |
try { |
|
144 |
final SolrQuery solrQuery = new SolrQuery(); |
|
145 |
solrQuery.setQuery(query); |
|
146 |
if(rows != null && rows >= 0) { |
|
147 |
solrQuery.setRows(rows); |
|
148 |
} |
|
149 |
return solrClient.query(solrQuery); |
|
150 |
} catch (final Throwable e) { |
|
151 |
throw new CloudIndexClientException("Error searching documents", e); |
|
152 |
} |
|
153 |
} |
|
154 |
|
|
155 |
public void close() throws IOException { |
|
156 |
if (solrClient != null) { |
|
157 |
solrClient.close(); |
|
158 |
} |
|
159 |
} |
|
160 |
|
|
161 |
public void commit() throws CloudIndexClientException { |
|
162 |
if(solrClient != null) { |
|
163 |
try { |
|
164 |
solrClient.commit(); |
|
165 |
} catch (SolrServerException | IOException e) { |
|
166 |
throw new CloudIndexClientException(e.getMessage()); |
|
167 |
} |
|
168 |
} |
|
169 |
} |
|
170 |
|
|
171 |
public interface AfterFeedingCallback { |
|
172 |
|
|
173 |
void doAfterFeeding(final UpdateResponse response); |
|
174 |
} |
|
175 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/index/CloudIndexClientFactory.java | ||
---|---|---|
1 |
package eu.dnetlib.data.index; |
|
2 |
|
|
3 |
import eu.dnetlib.functionality.index.utils.ZkServers; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
import org.apache.solr.client.solrj.impl.CloudSolrClient; |
|
7 |
import org.apache.solr.client.solrj.response.SolrPingResponse; |
|
8 |
|
|
9 |
/** |
|
10 |
* Created by michele on 11/11/15. |
|
11 |
*/ |
|
12 |
public class CloudIndexClientFactory { |
|
13 |
|
|
14 |
private static final Log log = LogFactory.getLog(CloudIndexClientFactory.class); |
|
15 |
|
|
16 |
public static CloudIndexClient newIndexClient(final String baseURL, final String collection, final boolean parallelUpdates) |
|
17 |
throws CloudIndexClientException { |
|
18 |
try { |
|
19 |
log.info(String.format("Initializing solr server (%s) ...", baseURL)); |
|
20 |
|
|
21 |
final ZkServers zk = ZkServers.newInstance(baseURL); |
|
22 |
final CloudSolrClient client = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()) |
|
23 |
.withParallelUpdates(parallelUpdates) |
|
24 |
.build(); |
|
25 |
|
|
26 |
client.connect(); |
|
27 |
client.setDefaultCollection(collection); |
|
28 |
|
|
29 |
final SolrPingResponse rsp = client.ping(); |
|
30 |
if (rsp.getStatus() != 0) { |
|
31 |
log.error("Invalid connection to solr Server (status = 0)"); |
|
32 |
throw new CloudIndexClientException("Invalid connection to solr Server (status = 0)"); |
|
33 |
} |
|
34 |
return new CloudIndexClient(client); |
|
35 |
} catch (Throwable e) { |
|
36 |
log.error("The initialization of indexClient is FAILED", e); |
|
37 |
throw new CloudIndexClientException("The initialization of indexClient is FAILED", e); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/transform/AuthorMerger.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import com.wcohen.ss.JaroWinkler; |
|
4 |
import eu.dnetlib.data.bulktag.Pair; |
|
5 |
import eu.dnetlib.data.proto.FieldTypeProtos.Author; |
|
6 |
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; |
|
7 |
import eu.dnetlib.pace.model.Person; |
|
8 |
import org.apache.commons.lang3.StringUtils; |
|
9 |
|
|
10 |
import java.text.Normalizer; |
|
11 |
import java.util.*; |
|
12 |
import java.util.function.Function; |
|
13 |
|
|
14 |
import static java.util.stream.Collectors.*; |
|
15 |
|
|
16 |
public class AuthorMerger { |
|
17 |
|
|
18 |
private static final Double THRESHOLD = 0.95; |
|
19 |
private static final String ORCID = "orcid"; |
|
20 |
private static final int MAX_AUTHORS = 200; |
|
21 |
|
|
22 |
public static List<Author> merge(final Collection<List<Author>> authors, final double threshold) { |
|
23 |
return merge(authors, THRESHOLD); |
|
24 |
} |
|
25 |
|
|
26 |
public static List<Author> merge(final Collection<List<Author>> authors) { |
|
27 |
return doMerge( |
|
28 |
authors.stream() |
|
29 |
.map(group -> group.stream() |
|
30 |
.map(AuthorMerger::fixORCID) |
|
31 |
.collect(toList())) |
|
32 |
.collect(toList())); |
|
33 |
} |
|
34 |
|
|
35 |
private static List<Author> doMerge(final Collection<List<Author>> authors) { |
|
36 |
final List<Author> res = new ArrayList<>(); |
|
37 |
|
|
38 |
if (authors.isEmpty()) { |
|
39 |
return res; |
|
40 |
} |
|
41 |
|
|
42 |
if (authors.size() == 1) { |
|
43 |
return authors.iterator().next(); |
|
44 |
} |
|
45 |
|
|
46 |
final TreeMap<Integer, List<List<Author>>> byOrcidCount = new TreeMap<>( |
|
47 |
authors.stream() |
|
48 |
.collect(groupingBy(AuthorMerger::countOrcid)) |
|
49 |
.entrySet().stream() |
|
50 |
.filter(e -> e.getKey() > 0) |
|
51 |
.collect(toMap( |
|
52 |
Map.Entry::getKey, |
|
53 |
Map.Entry::getValue |
|
54 |
))); |
|
55 |
|
|
56 |
if (byOrcidCount == null || byOrcidCount.isEmpty()) { |
|
57 |
return authors.iterator().next(); |
|
58 |
} |
|
59 |
final Map.Entry<Integer, List<List<Author>>> mostOrcid = byOrcidCount.lastEntry(); |
|
60 |
|
|
61 |
if (mostOrcid.getKey() > 0) { |
|
62 |
|
|
63 |
final List<Author> pivots = mostOrcid.getValue().iterator().next(); |
|
64 |
|
|
65 |
res.addAll(mostOrcid.getValue().iterator().next().stream() |
|
66 |
.filter(a -> hasOrcid(a)) |
|
67 |
.collect(toList())); |
|
68 |
|
|
69 |
if (pivots.size() == res.size()) { |
|
70 |
return res; |
|
71 |
} |
|
72 |
|
|
73 |
final Collection<Author> authorList = authors.stream() |
|
74 |
.filter(g -> !g.equals(pivots)) |
|
75 |
.flatMap(List::stream) |
|
76 |
.filter(a -> hasOrcid(a)) |
|
77 |
.limit(MAX_AUTHORS) |
|
78 |
.map(a -> { |
|
79 |
final String orcid = a.getPidList().stream() |
|
80 |
.filter(p -> p.getKey().equalsIgnoreCase(ORCID)) |
|
81 |
.findFirst() |
|
82 |
.get().getValue(); |
|
83 |
return new Pair<String, Author>(orcid, a); |
|
84 |
}) |
|
85 |
.collect(toMap( |
|
86 |
p -> p.getFst(), |
|
87 |
p -> p.getSnd(), |
|
88 |
(p1, p2) -> p2)) |
|
89 |
.values(); |
|
90 |
|
|
91 |
pivots.stream().filter(a -> !hasOrcid(a)).forEach(pivot -> { |
|
92 |
final Author.Builder b = Author.newBuilder(pivot); |
|
93 |
authorList.parallelStream() |
|
94 |
.map(a -> { |
|
95 |
return new Pair<Double, Author>(sim(a, pivot), a); |
|
96 |
}) |
|
97 |
.filter(p -> p.getFst() >= THRESHOLD) |
|
98 |
.forEach(p -> { |
|
99 |
b.mergeFrom(p.getSnd()); |
|
100 |
}); |
|
101 |
|
|
102 |
Collection<KeyValue> pids = b.getPidList().stream() |
|
103 |
.collect(toMap( |
|
104 |
kv -> kv.getKey(), |
|
105 |
Function.identity(), |
|
106 |
(kv1, kv2) -> kv2 |
|
107 |
)).values(); |
|
108 |
b.clearPid(); |
|
109 |
b.addAllPid(pids); |
|
110 |
|
|
111 |
res.add(b.build()); |
|
112 |
}); |
|
113 |
} |
|
114 |
|
|
115 |
return res; |
|
116 |
} |
|
117 |
|
|
118 |
private static Author fixORCID(final Author author) { |
|
119 |
final Author.Builder b = Author.newBuilder(author); |
|
120 |
for(KeyValue.Builder pid : b.getPidBuilderList()) { |
|
121 |
if (pid.getKey().toLowerCase().contains(ORCID)) { |
|
122 |
pid.setKey("ORCID"); |
|
123 |
if (pid.getValue().contains("orcid.org")) { |
|
124 |
pid.setValue(StringUtils.substringAfterLast(pid.getValue(), "/")); |
|
125 |
|
|
126 |
} |
|
127 |
} |
|
128 |
} |
|
129 |
return b.build(); |
|
130 |
} |
|
131 |
|
|
132 |
private static int countOrcid(final List<Author> authors) { |
|
133 |
return authors.stream() |
|
134 |
.map(a -> { |
|
135 |
return hasOrcid(a) ? 1 : 0; |
|
136 |
}) |
|
137 |
.mapToInt(Integer::intValue) |
|
138 |
.sum(); |
|
139 |
} |
|
140 |
|
|
141 |
private static boolean hasOrcid(Author a) { |
|
142 |
return a.getPidList().stream().anyMatch(p -> p.getKey().equalsIgnoreCase(ORCID)); |
|
143 |
} |
|
144 |
|
|
145 |
private static Double sim(Author a, Author b) { |
|
146 |
|
|
147 |
final Person pa = parse(a); |
|
148 |
final Person pb = parse(b); |
|
149 |
|
|
150 |
if (pa.isAccurate() & pb.isAccurate()) { |
|
151 |
return new JaroWinkler().score( |
|
152 |
normalize(pa.getSurnameString()), |
|
153 |
normalize(pb.getSurnameString())); |
|
154 |
} else { |
|
155 |
return new JaroWinkler().score( |
|
156 |
normalize(pa.getNormalisedFullname()), |
|
157 |
normalize(pb.getNormalisedFullname())); |
|
158 |
} |
|
159 |
} |
|
160 |
|
|
161 |
private static Person parse(Author author) { |
|
162 |
if (author.hasSurname()) { |
|
163 |
return new Person(author.getSurname() + ", " + author.getName(), false); |
|
164 |
} else { |
|
165 |
return new Person(author.getFullname(), false); |
|
166 |
} |
|
167 |
} |
|
168 |
|
|
169 |
private static String normalize(final String s) { |
|
170 |
return nfd(s).toLowerCase() |
|
171 |
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings |
|
172 |
.replaceAll("(\\W)+", " ") |
|
173 |
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") |
|
174 |
.replaceAll("(\\p{Punct})+", " ") |
|
175 |
.replaceAll("(\\d)+", " ") |
|
176 |
.replaceAll("(\\n)+", " ") |
|
177 |
.trim(); |
|
178 |
} |
|
179 |
|
|
180 |
private static String nfd(final String s) { |
|
181 |
return Normalizer.normalize(s, Normalizer.Form.NFD); |
|
182 |
} |
|
183 |
|
|
184 |
|
|
185 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/index/CloudIndexClientException.java | ||
---|---|---|
1 |
package eu.dnetlib.data.index; |
|
2 |
|
|
3 |
/**
 * Checked exception raised by the cloud index client classes of this package.
 *
 * Created by michele on 23/11/15.
 */
public class CloudIndexClientException extends Exception {

    /**
     * Creates an exception carrying only a description of the problem.
     *
     * @param message the description of the problem
     */
    public CloudIndexClientException(final String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a description of the problem and its origin.
     *
     * @param message the description of the problem
     * @param cause   the error that originated the problem
     */
    public CloudIndexClientException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/transform/DatePicker.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
|
4 |
import org.apache.commons.lang.StringUtils; |
|
5 |
|
|
6 |
import java.time.Year; |
|
7 |
import java.util.*; |
|
8 |
import java.util.stream.Collectors; |
|
9 |
|
|
10 |
import static java.util.Collections.reverseOrder; |
|
11 |
import static java.util.Map.Entry.comparingByValue; |
|
12 |
import static java.util.stream.Collectors.toMap; |
|
13 |
import static org.apache.commons.lang.StringUtils.endsWith; |
|
14 |
import static org.apache.commons.lang.StringUtils.substringBefore; |
|
15 |
|
|
16 |
public class DatePicker { |
|
17 |
|
|
18 |
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; |
|
19 |
private static final String DATE_DEFAULT_SUFFIX = "01-01"; |
|
20 |
private static final int YEAR_LB = 1300; |
|
21 |
private static final int YEAR_UB = Year.now().getValue() + 5; |
|
22 |
|
|
23 |
public static FieldTypeProtos.StringField pick(final Collection<String> dateofacceptance) { |
|
24 |
|
|
25 |
final Map<String, Integer> frequencies = dateofacceptance |
|
26 |
.parallelStream() |
|
27 |
.filter(StringUtils::isNotBlank) |
|
28 |
.collect( |
|
29 |
Collectors.toConcurrentMap( |
|
30 |
w -> w, w -> 1, Integer::sum)); |
|
31 |
|
|
32 |
if (frequencies.isEmpty()) { |
|
33 |
return FieldTypeProtos.StringField.newBuilder().setValue("").build(); |
|
34 |
} |
|
35 |
|
|
36 |
final FieldTypeProtos.StringField.Builder date = FieldTypeProtos.StringField.newBuilder().setValue(frequencies.keySet().iterator().next()); |
|
37 |
|
|
38 |
// let's sort this map by values first, filtering out invalid dates |
|
39 |
final Map<String, Integer> sorted = frequencies |
|
40 |
.entrySet() |
|
41 |
.stream() |
|
42 |
.filter(d -> StringUtils.isNotBlank(d.getKey())) |
|
43 |
.filter(d -> d.getKey().matches(DATE_PATTERN)) |
|
44 |
.filter(d -> inRange(d.getKey())) |
|
45 |
.sorted(reverseOrder(comparingByValue())) |
|
46 |
.collect( |
|
47 |
toMap( |
|
48 |
Map.Entry::getKey, |
|
49 |
Map.Entry::getValue, (e1, e2) -> e2, |
|
50 |
LinkedHashMap::new)); |
|
51 |
|
|
52 |
// shortcut |
|
53 |
if (sorted.size() == 0) { |
|
54 |
return date.build(); |
|
55 |
} |
|
56 |
|
|
57 |
// voting method (1/3 + 1) wins |
|
58 |
if (sorted.size() >= 3) { |
|
59 |
final int acceptThreshold = (sorted.size() / 3) + 1; |
|
60 |
final List<String> accepted = sorted.entrySet().stream() |
|
61 |
.filter(e -> e.getValue() >= acceptThreshold) |
|
62 |
.map(e -> e.getKey()) |
|
63 |
.collect(Collectors.toList()); |
|
64 |
|
|
65 |
// cannot find strong majority |
|
66 |
if (accepted.isEmpty()) { |
|
67 |
final int max = sorted.values().iterator().next(); |
|
68 |
Optional<String> first = sorted.entrySet().stream() |
|
69 |
.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) |
|
70 |
.map(Map.Entry::getKey) |
|
71 |
.findFirst(); |
|
72 |
if (first.isPresent()) { |
|
73 |
return date.setValue(first.get()).build(); |
|
74 |
} |
|
75 |
|
|
76 |
return date.setValue(sorted.keySet().iterator().next()).build(); |
|
77 |
} |
|
78 |
|
|
79 |
if (accepted.size() == 1) { |
|
80 |
return date.setValue(accepted.get(0)).build(); |
|
81 |
} else { |
|
82 |
final Optional<String> first = accepted.stream() |
|
83 |
.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) |
|
84 |
.findFirst(); |
|
85 |
if (first.isPresent()) { |
|
86 |
return date.setValue(first.get()).build(); |
|
87 |
} |
|
88 |
|
|
89 |
return date.build(); |
|
90 |
} |
|
91 |
|
|
92 |
//1st non YYYY-01-01 is returned |
|
93 |
} else { |
|
94 |
if (sorted.size() == 2) { |
|
95 |
for (Map.Entry<String, Integer> e : sorted.entrySet()) { |
|
96 |
if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { |
|
97 |
return date.setValue(e.getKey()).build(); |
|
98 |
} |
|
99 |
} |
|
100 |
} |
|
101 |
|
|
102 |
// none of the dates seems good enough, return the 1st one |
|
103 |
return date.setValue(sorted.keySet().iterator().next()).build(); |
|
104 |
} |
|
105 |
} |
|
106 |
|
|
107 |
private static boolean inRange(final String date) { |
|
108 |
final int year = Integer.parseInt(substringBefore(date, "-")); |
|
109 |
return year >= YEAR_LB && year <= YEAR_UB; |
|
110 |
} |
|
111 |
|
|
112 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/transform/TrustOrdering.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import com.google.common.collect.ImmutableList; |
|
4 |
import com.google.common.collect.Ordering; |
|
5 |
|
|
6 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
7 |
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust; |
|
8 |
import org.apache.commons.lang3.StringUtils; |
|
9 |
|
|
10 |
public class TrustOrdering extends Ordering<Oaf> { |
|
11 |
|
|
12 |
@Override |
|
13 |
public int compare(Oaf left, Oaf right) { |
|
14 |
String lTrust = left.getDataInfo().getTrust(); |
|
15 |
String rTrust = right.getDataInfo().getTrust(); |
|
16 |
|
|
17 |
if (lTrust.equals(rTrust)) return 0; |
|
18 |
|
|
19 |
if (lTrust.equals(SpecialTrust.INFINITE.toString())) return 1; |
|
20 |
if (rTrust.equals(SpecialTrust.INFINITE.toString())) return -1; |
|
21 |
|
|
22 |
if (lTrust.equals(SpecialTrust.NEUTRAL.toString())) return 1; |
|
23 |
if (rTrust.equals(SpecialTrust.NEUTRAL.toString())) return -1; |
|
24 |
|
|
25 |
return Float.compare( |
|
26 |
Float.parseFloat(StringUtils.isBlank(lTrust) ? "0.9" : lTrust), |
|
27 |
Float.parseFloat(StringUtils.isBlank(rTrust) ? "0.9" : rTrust)); |
|
28 |
} |
|
29 |
|
|
30 |
public static ImmutableList<Oaf> sort(Iterable<Oaf> entities) { |
|
31 |
return new TrustOrdering().immutableSortedCopy(entities); |
|
32 |
} |
|
33 |
|
|
34 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/transform/OafToRowMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.function.Function; |
|
5 |
|
|
6 |
import com.google.common.collect.Lists; |
|
7 |
import eu.dnetlib.data.mapreduce.util.OafDecoder; |
|
8 |
import eu.dnetlib.data.mapreduce.util.OafEntityDecoder; |
|
9 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
10 |
|
|
11 |
public class OafToRowMapper implements Function<Oaf, List<Row>> { |
|
12 |
|
|
13 |
public static final String BODY = "body"; |
|
14 |
|
|
15 |
@Override |
|
16 |
public List<Row> apply(final Oaf oaf) { |
|
17 |
final List<Row> rows = Lists.newArrayList(); |
|
18 |
|
|
19 |
final OafDecoder d = OafDecoder.decode(oaf); |
|
20 |
final OafEntityDecoder entity = d.decodeEntity(); |
|
21 |
|
|
22 |
final Row r = new Row(d.getCFQ(), entity.getId()); |
|
23 |
switch (entity.getType()) { |
|
24 |
|
|
25 |
case project: |
|
26 |
r.addColumn(new Column<>(BODY, oaf.toByteArray())); |
|
27 |
break; |
|
28 |
case result: |
|
29 |
oaf.getEntity().getCachedOafRelList().stream() |
|
30 |
.map(cachedRel -> { |
|
31 |
final Oaf.Builder oafRel = Oaf.newBuilder(cachedRel); |
|
32 |
oafRel.getRelBuilder().clearCachedOafTarget(); |
|
33 |
return oafRel.build(); |
|
34 |
}).forEach(oafRel -> r.addColumn(new Column<>(OafDecoder.decode(oafRel).getCFQ(), oafRel.toByteArray()))); |
|
35 |
break; |
|
36 |
case datasource: |
|
37 |
|
|
38 |
break; |
|
39 |
case organization: |
|
40 |
|
|
41 |
break; |
|
42 |
} |
|
43 |
|
|
44 |
return rows; |
|
45 |
} |
|
46 |
|
|
47 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/Datasource.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag; |
|
2 |
|
|
3 |
|
|
4 |
import com.google.gson.Gson; |
|
5 |
import eu.dnetlib.data.bulktag.selectioncriteria.VerbResolver; |
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
import org.dom4j.Node; |
|
9 |
|
|
10 |
/** |
|
11 |
* Created by miriam on 01/08/2018. |
|
12 |
*/ |
|
13 |
public class Datasource { |
|
14 |
private static final Log log = LogFactory.getLog(Datasource.class); |
|
15 |
|
|
16 |
private String openaireId; |
|
17 |
|
|
18 |
private SelectionConstraints selectionConstraints; |
|
19 |
|
|
20 |
|
|
21 |
public SelectionConstraints getSelCriteria() { |
|
22 |
return selectionConstraints; |
|
23 |
} |
|
24 |
|
|
25 |
public SelectionConstraints getSelectionConstraints() { |
|
26 |
return selectionConstraints; |
|
27 |
} |
|
28 |
|
|
29 |
public void setSelectionConstraints(SelectionConstraints selectionConstraints) { |
|
30 |
this.selectionConstraints = selectionConstraints; |
|
31 |
} |
|
32 |
|
|
33 |
public void setSelCriteria(SelectionConstraints selCriteria) { |
|
34 |
this.selectionConstraints = selCriteria; |
|
35 |
} |
|
36 |
|
|
37 |
public String getOpenaireId() { |
|
38 |
return openaireId; |
|
39 |
} |
|
40 |
|
|
41 |
public void setOpenaireId(String openaireId) { |
|
42 |
this.openaireId = openaireId; |
|
43 |
} |
|
44 |
|
|
45 |
private void setSelCriteria(String json, VerbResolver resolver){ |
|
46 |
log.info("Selection constraints for datasource = " + json); |
|
47 |
selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class); |
|
48 |
|
|
49 |
selectionConstraints.setSelection(resolver); |
|
50 |
} |
|
51 |
|
|
52 |
public void setSelCriteria(Node n, VerbResolver resolver){ |
|
53 |
try{ |
|
54 |
setSelCriteria(n.getText(),resolver); |
|
55 |
}catch(Exception e) { |
|
56 |
log.info("not set selection criteria... "); |
|
57 |
selectionConstraints =null; |
|
58 |
} |
|
59 |
|
|
60 |
} |
|
61 |
|
|
62 |
|
|
63 |
|
|
64 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/Organization.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import eu.dnetlib.data.bulktag.selectioncriteria.VerbResolver; |
|
5 |
import org.dom4j.Node; |
|
6 |
|
|
7 |
public class Organization { |
|
8 |
private String organizationId; |
|
9 |
|
|
10 |
private SelectionConstraints selCriteria; |
|
11 |
|
|
12 |
public String getOrganizationId() { |
|
13 |
return organizationId; |
|
14 |
} |
|
15 |
|
|
16 |
public void setOrganizationId(String organizationId) { |
|
17 |
this.organizationId = organizationId; |
|
18 |
} |
|
19 |
|
|
20 |
public SelectionConstraints getSelCriteria() { |
|
21 |
return selCriteria; |
|
22 |
} |
|
23 |
|
|
24 |
public void setSelCriteria(SelectionConstraints selCriteria) { |
|
25 |
this.selCriteria = selCriteria; |
|
26 |
} |
|
27 |
|
|
28 |
private void setSelCriteria(String json){ |
|
29 |
//Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType(); |
|
30 |
selCriteria = new Gson().fromJson(json, SelectionConstraints.class); |
|
31 |
|
|
32 |
} |
|
33 |
|
|
34 |
public void setSelCriteria(Node n){ |
|
35 |
if (n==null){ |
|
36 |
selCriteria = null; |
|
37 |
}else{ |
|
38 |
setSelCriteria(n.getText()); |
|
39 |
} |
|
40 |
} |
|
41 |
|
|
42 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/Constraints.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag; |
|
2 |
|
|
3 |
import com.google.common.reflect.TypeToken; |
|
4 |
import com.google.gson.Gson; |
|
5 |
import eu.dnetlib.data.bulktag.selectioncriteria.VerbResolver; |
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
|
|
9 |
|
|
10 |
import java.io.Serializable; |
|
11 |
import java.lang.reflect.InvocationTargetException; |
|
12 |
import java.lang.reflect.Type; |
|
13 |
import java.util.Collection; |
|
14 |
import java.util.List; |
|
15 |
import java.util.Map; |
|
16 |
|
|
17 |
/** |
|
18 |
* Created by miriam on 02/08/2018. |
|
19 |
*/ |
|
20 |
public class Constraints implements Serializable { |
|
21 |
private static final Log log = LogFactory.getLog(Constraints.class); |
|
22 |
//private ConstraintEncapsulator ce; |
|
23 |
private List<Constraint> constraint; |
|
24 |
|
|
25 |
|
|
26 |
public Constraints() { |
|
27 |
} |
|
28 |
public List<Constraint> getConstraint() { |
|
29 |
return constraint; |
|
30 |
} |
|
31 |
|
|
32 |
public void setConstraint(List<Constraint> constraint) { |
|
33 |
this.constraint = constraint; |
|
34 |
} |
|
35 |
|
|
36 |
public void setSc(String json){ |
|
37 |
Type collectionType = new TypeToken<Collection<Constraint>>(){}.getType(); |
|
38 |
constraint = new Gson().fromJson(json, collectionType); |
|
39 |
|
|
40 |
} |
|
41 |
|
|
42 |
void setSelection(VerbResolver resolver) { |
|
43 |
for(Constraint st: constraint){ |
|
44 |
|
|
45 |
try { |
|
46 |
st.setSelection(resolver); |
|
47 |
} catch (NoSuchMethodException e) { |
|
48 |
log.error(e.getMessage()); |
|
49 |
} catch (IllegalAccessException e) { |
|
50 |
log.error(e.getMessage()); |
|
51 |
} catch (InvocationTargetException e) { |
|
52 |
log.error(e.getMessage()); |
|
53 |
} catch (InstantiationException e) { |
|
54 |
log.error(e.getMessage()); |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
} |
|
59 |
|
|
60 |
|
|
61 |
//Constraint in and |
|
62 |
public boolean verifyCriteria(final Map<String, List<String>> param) { |
|
63 |
|
|
64 |
for(Constraint sc : constraint) { |
|
65 |
boolean verified = false; |
|
66 |
for(String value : param.get(sc.getField())){ |
|
67 |
if (sc.verifyCriteria(value.trim())){ |
|
68 |
verified = true; |
|
69 |
} |
|
70 |
} |
|
71 |
if(!verified) |
|
72 |
return verified; |
|
73 |
} |
|
74 |
return true; |
|
75 |
} |
|
76 |
|
|
77 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/selectioncriteria/NotEqualVerb.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag.selectioncriteria; |
|
2 |
|
|
3 |
|
|
4 |
@VerbClass("not_equals") |
|
5 |
public class NotEqualVerb implements Selection { |
|
6 |
|
|
7 |
private String param; |
|
8 |
|
|
9 |
|
|
10 |
public NotEqualVerb(final String param) { |
|
11 |
this.param = param; |
|
12 |
} |
|
13 |
|
|
14 |
public NotEqualVerb() { |
|
15 |
} |
|
16 |
|
|
17 |
public String getParam() { |
|
18 |
return param; |
|
19 |
} |
|
20 |
|
|
21 |
public void setParam(String param) { |
|
22 |
this.param = param; |
|
23 |
} |
|
24 |
|
|
25 |
@Override |
|
26 |
public boolean apply(String value) { |
|
27 |
return !value.equalsIgnoreCase(param); |
|
28 |
} |
|
29 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/selectioncriteria/VerbResolver.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag.selectioncriteria; |
|
2 |
|
|
3 |
import org.reflections.Reflections; |
|
4 |
|
|
5 |
import java.io.Serializable; |
|
6 |
import java.lang.reflect.InvocationTargetException; |
|
7 |
import java.util.Map; |
|
8 |
import java.util.stream.Collectors; |
|
9 |
|
|
10 |
public class VerbResolver implements Serializable { |
|
11 |
private final Map<String, Class<Selection>> map; |
|
12 |
|
|
13 |
public VerbResolver(){ |
|
14 |
this.map = new Reflections("eu.dnetlib").getTypesAnnotatedWith(VerbClass.class).stream() |
|
15 |
.collect(Collectors.toMap(v -> v.getAnnotation(VerbClass.class).value(), v->(Class<Selection>)v)); |
|
16 |
} |
|
17 |
|
|
18 |
|
|
19 |
public Selection getSelectionCriteria(String name, String param) throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException { |
|
20 |
|
|
21 |
return map.get(name).getDeclaredConstructor((String.class)).newInstance(param); |
|
22 |
|
|
23 |
} |
|
24 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/bulktag/Constraint.java | ||
---|---|---|
1 |
package eu.dnetlib.data.bulktag; |
|
2 |
|
|
3 |
import eu.dnetlib.data.bulktag.selectioncriteria.Selection; |
|
4 |
import eu.dnetlib.data.bulktag.selectioncriteria.VerbResolver; |
|
5 |
import org.springframework.beans.factory.annotation.Autowired; |
|
6 |
|
|
7 |
import java.io.Serializable; |
|
8 |
import java.lang.reflect.InvocationTargetException; |
|
9 |
|
|
10 |
|
|
11 |
public class Constraint implements Serializable { |
|
12 |
private String verb; |
|
13 |
private String field; |
|
14 |
private String value; |
|
15 |
private Selection selection; |
|
16 |
|
|
17 |
public Constraint() { |
|
18 |
} |
|
19 |
|
|
20 |
public String getVerb() { |
|
21 |
return verb; |
|
22 |
} |
|
23 |
|
|
24 |
public void setVerb(String verb) { |
|
25 |
this.verb = verb; |
|
26 |
} |
|
27 |
|
|
28 |
public String getField() { |
|
29 |
return field; |
|
30 |
} |
|
31 |
|
|
32 |
public void setField(String field) { |
|
33 |
this.field = field; |
|
34 |
} |
|
35 |
|
|
36 |
public String getValue() { |
|
37 |
return value; |
|
38 |
} |
|
39 |
|
|
40 |
public void setValue(String value) { |
|
41 |
this.value = value; |
|
42 |
} |
|
43 |
|
|
44 |
|
|
45 |
|
|
46 |
public void setSelection(Selection sel){ |
|
47 |
selection = sel; |
|
48 |
} |
|
49 |
|
|
50 |
public void setSelection(VerbResolver resolver) throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException { |
|
51 |
selection = resolver.getSelectionCriteria(verb,value); |
|
52 |
} |
|
53 |
|
|
54 |
|
|
55 |
public boolean verifyCriteria(String metadata){ |
|
56 |
return selection.apply(metadata); |
|
57 |
} |
|
58 |
|
|
59 |
|
|
60 |
|
|
61 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/mapreduce/util/RelDescriptor.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.util; |
|
2 |
|
|
3 |
import eu.dnetlib.data.proto.RelTypeProtos.RelType; |
|
4 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
|
5 |
|
|
6 |
public class RelDescriptor { |
|
7 |
|
|
8 |
public static final String SEPARATOR = "_"; |
|
9 |
|
|
10 |
private final String it; |
|
11 |
|
|
12 |
// relType also corresponds to the Ontology code |
|
13 |
private final RelType relType; |
|
14 |
|
|
15 |
private final SubRelType subRelType; |
|
16 |
|
|
17 |
private final String relClass; |
|
18 |
|
|
19 |
|
|
20 |
// <TERM code="hasAmongTopNSimilarDocuments" encoding="resultResult_similarity_hasAmongTopNSimilarDocuments" |
|
21 |
//final String rd = oafRel.getRelType().toString() + "_" + oafRel.getSubRelType() + "_" + relClasses.getInverse(oafRel.getRelClass()); |
|
22 |
//<ONTOLOGY_NAME code="dnet:result_result_relations"> |
|
23 |
public RelDescriptor(final String value) { |
|
24 |
super(); |
|
25 |
this.it = value; |
|
26 |
|
|
27 |
String[] s = value.split(SEPARATOR); |
|
28 |
|
|
29 |
this.relType = RelType.valueOf(s[0]); |
|
30 |
this.subRelType = SubRelType.valueOf(s[1]); |
|
31 |
this.relClass = s[2]; |
|
32 |
|
|
33 |
} |
|
34 |
|
|
35 |
public SubRelType getSubRelType() { |
|
36 |
return subRelType; |
|
37 |
} |
|
38 |
|
|
39 |
public RelType getRelType() { |
|
40 |
return relType; |
|
41 |
} |
|
42 |
|
|
43 |
public String getRelClass() { |
|
44 |
return relClass; |
|
45 |
} |
|
46 |
|
|
47 |
public String getIt() { |
|
48 |
return it; |
|
49 |
} |
|
50 |
|
|
51 |
|
|
52 |
@Override |
|
53 |
public String toString() { |
|
54 |
return getIt(); |
|
55 |
} |
|
56 |
|
|
57 |
@Override |
|
58 |
public int hashCode() { |
|
59 |
final int prime = 31; |
|
60 |
int result = 1; |
|
61 |
result = (prime * result) + ((it == null) ? 0 : it.hashCode()); |
|
62 |
return result; |
|
63 |
} |
|
64 |
|
|
65 |
@Override |
|
66 |
public boolean equals(final Object obj) { |
|
67 |
if (this == obj) return true; |
|
68 |
if (obj == null) return false; |
|
69 |
if (getClass() != obj.getClass()) return false; |
|
70 |
RelDescriptor other = (RelDescriptor) obj; |
|
71 |
if (it == null) { |
|
72 |
if (other.it != null) return false; |
|
73 |
} else if (!it.equals(other.it)) return false; |
|
74 |
return true; |
|
75 |
} |
|
76 |
|
|
77 |
} |
modules/dnet-openaireplus-mapping-utils/tags/dnet-openaireplus-mapping-utils-7.0.1/src/main/java/eu/dnetlib/data/transform/xml/AbstractDNetXsltFunctions.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform.xml; |
|
2 |
|
|
3 |
import java.nio.charset.Charset; |
|
4 |
import java.security.MessageDigest; |
|
5 |
import java.util.Collection; |
|
6 |
import java.util.List; |
|
7 |
import java.util.Map; |
|
8 |
import java.util.Objects; |
|
9 |
import java.util.Set; |
|
10 |
import java.util.function.Function; |
|
11 |
import java.util.stream.Collectors; |
|
12 |
|
|
13 |
import org.apache.commons.codec.binary.Base64; |
|
14 |
import org.apache.commons.codec.binary.Hex; |
|
15 |
import org.apache.commons.lang.math.NumberUtils; |
|
16 |
import org.apache.commons.lang3.StringUtils; |
|
17 |
import org.w3c.dom.NamedNodeMap; |
|
18 |
import org.w3c.dom.Node; |
|
19 |
import org.w3c.dom.NodeList; |
|
20 |
|
|
21 |
import com.google.common.base.Predicate; |
|
22 |
import com.google.common.base.Splitter; |
|
23 |
import com.google.common.collect.Lists; |
|
24 |
import com.google.common.collect.Maps; |
|
25 |
import com.google.common.collect.Sets; |
|
26 |
import com.google.gson.JsonObject; |
|
27 |
import com.google.protobuf.Descriptors.Descriptor; |
|
28 |
import com.google.protobuf.Descriptors.FieldDescriptor; |
|
29 |
import com.google.protobuf.InvalidProtocolBufferException; |
|
30 |
import com.google.protobuf.Message; |
|
31 |
import com.google.protobuf.Message.Builder; |
|
32 |
import com.google.protobuf.ProtocolMessageEnum; |
|
33 |
|
|
34 |
import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization; |
|
35 |
import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision; |
|
36 |
import eu.dnetlib.data.proto.DedupProtos.Dedup; |
|
37 |
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity; |
|
38 |
import eu.dnetlib.data.proto.FieldTypeProtos.BoolField; |
|
39 |
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo; |
|
40 |
import eu.dnetlib.data.proto.FieldTypeProtos.IntField; |
|
41 |
import eu.dnetlib.data.proto.FieldTypeProtos.Journal; |
|
42 |
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; |
|
43 |
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance; |
|
44 |
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance.OriginDescription; |
|
45 |
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; |
|
46 |
import eu.dnetlib.data.proto.FieldTypeProtos.StringField; |
|
47 |
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; |
|
48 |
import eu.dnetlib.data.proto.KindProtos.Kind; |
|
49 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
50 |
import eu.dnetlib.data.proto.OafProtos.OafEntity; |
|
51 |
import eu.dnetlib.data.proto.OafProtos.OafRel; |
|
52 |
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization; |
|
53 |
import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization; |
|
54 |
import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation; |
|
55 |
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata; |
|
56 |
import eu.dnetlib.data.proto.RelTypeProtos.RelType; |
|
57 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
|
58 |
import eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization; |
|
59 |
import eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization.Affiliation; |
|
60 |
import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject; |
|
61 |
import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome; |
|
62 |
import eu.dnetlib.data.proto.ResultProtos.Result.Metadata; |
|
63 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult; |
|
64 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Part; |
|
65 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.PublicationDataset; |
|
66 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity; |
|
67 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Supplement; |
|
68 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
|
69 |
import eu.dnetlib.miscutils.collections.Pair; |
|
70 |
import eu.dnetlib.miscutils.iterators.IterablePair; |
|
71 |
|
|
72 |
public abstract class AbstractDNetXsltFunctions { |
|
73 |
|
|
74 |
public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; |
|
75 |
private static final int MAX_NSPREFIX_LEN = 12; |
|
76 |
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX); |
|
77 |
public static Map<String, String> code2name = Maps.newHashMap(); |
|
78 |
|
|
79 |
/* |
|
80 |
* Obtained via COPY (select code, name from class) TO '/tmp/class_scheme.csv' (FORMAT csv, delimiter ',', FORCE_QUOTE *); on the |
|
81 |
* relational db |
|
82 |
*/ |
|
83 |
// code2name.put("openaire2.0_data","OpenAIRE Data (funded, referenced datasets)"); |
|
84 |
static { |
|
85 |
code2name.put("MH", "Marshall Islands"); |
|
86 |
code2name.put("CF", "Central African Republic"); |
|
87 |
code2name.put("TD", "Chad"); |
|
88 |
code2name.put("CN", "China (People's Republic of)"); |
|
89 |
code2name.put("NG", "Nigeria"); |
|
90 |
code2name.put("NF", "Norfolk Island"); |
|
91 |
code2name.put("MP", "Northern Mariana Islands"); |
|
92 |
code2name.put("PS", "Palestinian-administered areas"); |
|
93 |
code2name.put("SZ", "Swaziland"); |
|
94 |
code2name.put("max", "Manx"); |
|
95 |
code2name.put("TW", "Taiwan"); |
|
96 |
code2name.put("TJ", "Tajikistan"); |
|
97 |
code2name.put("BSG", "Research for the benefit of specific groups"); |
|
98 |
code2name.put("CP", "Collaborative project"); |
|
99 |
code2name.put("12MONTHS", "12 Months Embargo"); |
|
100 |
code2name.put("ace", "Achinese"); |
|
101 |
code2name.put("egy", "Ancient Egyptian"); |
|
102 |
code2name.put("ara", "Arabic"); |
|
103 |
code2name.put("arc", "Aramaic"); |
|
104 |
code2name.put("arp", "Arapaho"); |
|
105 |
code2name.put("gon", "Gondi"); |
|
106 |
code2name.put("ine", "Indo-European"); |
|
107 |
code2name.put("ipk", "Inupiaq"); |
|
108 |
code2name.put("ira", "Iranian"); |
|
109 |
code2name.put("lim", "Limburgan; Limburger; Limburgish"); |
|
110 |
code2name.put("mni", "Manipuri"); |
|
111 |
code2name.put("mno", "Manobo"); |
|
112 |
code2name.put("men", "Mende"); |
|
113 |
code2name.put("CX", "Christmas Island"); |
|
114 |
code2name.put("CC", "Cocos (Keeling) Islands"); |
|
115 |
code2name.put("KM", "Comoros"); |
|
116 |
code2name.put("CG", "Congo"); |
|
117 |
code2name.put("CK", "Cook Islands"); |
|
118 |
code2name.put("HR", "Croatia"); |
|
119 |
code2name.put("arn", "Araucanian"); |
|
120 |
code2name.put("art", "Artificial"); |
|
121 |
code2name.put("nah", "Aztec"); |
|
122 |
code2name.put("bug", "Buginese"); |
|
123 |
code2name.put("chn", "Chinook jargon"); |
|
124 |
code2name.put("chv", "Chuvash"); |
|
125 |
code2name.put("mus", "Creek"); |
|
126 |
code2name.put("mic", "Micmac"); |
|
127 |
code2name.put("min", "Minangkabau"); |
|
128 |
code2name.put("fro", "Old French"); |
|
129 |
code2name.put("cpp", "Portuguese-based Creoles and Pidgins"); |
|
130 |
code2name.put("som", "Somali"); |
|
131 |
code2name.put("wen", "Sorbian"); |
|
132 |
code2name.put("hrv", "Croatian"); |
|
133 |
code2name.put("cus", "Cushitic"); |
|
134 |
code2name.put("sot", "Sotho, Southern"); |
|
135 |
code2name.put("sai", "South American Indian"); |
|
136 |
code2name.put("esl/spa", "Spanish"); |
|
137 |
code2name.put("CU", "Cuba"); |
|
138 |
code2name.put("CW", "Curaçao"); |
|
139 |
code2name.put("CZ", "Czech Republic"); |
|
140 |
code2name.put("DK", "Denmark"); |
|
141 |
code2name.put("ER", "Eritrea"); |
|
142 |
code2name.put("TF", "French Southern Territories"); |
|
143 |
code2name.put("GW", "Guinea-Bissau"); |
|
144 |
code2name.put("VA", "Holy See (Vatican City State)"); |
|
145 |
code2name.put("BO", "Bolivia"); |
|
146 |
code2name.put("KY", "Cayman Islands"); |
|
147 |
code2name.put("dra", "Dravidian"); |
|
148 |
code2name.put("cpe", "English-based Creoles and Pidgins"); |
|
149 |
code2name.put("oji", "Ojibwa"); |
|
150 |
code2name.put("CIP-EIP-TN", "CIP-Eco-Innovation - CIP-Thematic Network"); |
|
151 |
code2name.put("jav/jaw", "Javanese"); |
|
152 |
code2name.put("ach", "Acoli"); |
|
153 |
code2name.put("ada", "Adangme"); |
|
154 |
code2name.put("afh", "Afrihili"); |
|
155 |
code2name.put("afr", "Afrikaans"); |
|
156 |
code2name.put("afa", "Afro-Asiatic"); |
|
157 |
code2name.put("ale", "Aleut"); |
|
158 |
code2name.put("alg", "Algonquian languages"); |
|
159 |
code2name.put("arw", "Arawak"); |
|
160 |
code2name.put("asm", "Assamese"); |
|
161 |
code2name.put("ava", "Avaric"); |
|
162 |
code2name.put("ave", "Avestan"); |
|
163 |
code2name.put("bra", "Braj"); |
|
164 |
code2name.put("bua", "Buriat"); |
|
165 |
code2name.put("chr", "Cherokee"); |
|
166 |
code2name.put("chy", "Cheyenne"); |
|
167 |
code2name.put("jrb", "Judeo-Arabic"); |
|
168 |
code2name.put("jpr", "Judeo-Persian"); |
|
169 |
code2name.put("kab", "Kabyle"); |
|
170 |
code2name.put("kac", "Kachin"); |
|
171 |
code2name.put("kaa", "Kara-Kalpak"); |
|
172 |
code2name.put("loz", "Lozi"); |
|
173 |
code2name.put("mwr", "Marwari"); |
|
174 |
code2name.put("DJ", "Djibouti"); |
|
175 |
code2name.put("JM", "Jamaica"); |
|
176 |
code2name.put("JP", "Japan"); |
|
177 |
code2name.put("JE", "Jersey"); |
|
178 |
code2name.put("JO", "Jordan"); |
|
179 |
code2name.put("KZ", "Kazakhstan"); |
|
180 |
code2name.put("KE", "Kenya"); |
|
181 |
code2name.put("KI", "Kiribati"); |
|
182 |
code2name.put("KR", "Korea (Republic of)"); |
|
183 |
code2name.put("KP", "Korea, Democatric People's Republic of"); |
|
184 |
code2name.put("XK", "Kosovo * UN resolution"); |
|
185 |
code2name.put("KW", "Kuwait"); |
|
186 |
code2name.put("NL", "Netherlands"); |
|
187 |
code2name.put("PE", "Peru"); |
|
188 |
code2name.put("PH", "Philippines"); |
|
189 |
code2name.put("fre/fra", "French"); |
|
190 |
code2name.put("PL", "Poland"); |
|
191 |
code2name.put("PT", "Portugal"); |
|
192 |
code2name.put("PR", "Puerto Rico"); |
|
193 |
code2name.put("QA", "Qatar"); |
|
194 |
code2name.put("RO", "Romania"); |
|
195 |
code2name.put("RU", "Russian Federation"); |
|
196 |
code2name.put("RW", "Rwanda"); |
|
197 |
code2name.put("RE", "Réunion"); |
|
198 |
code2name.put("sve/swe", "Swedish"); |
|
199 |
code2name.put("myn", "Mayan"); |
|
200 |
code2name.put("dum", "Middle Dutch"); |
|
201 |
code2name.put("mun", "Munda"); |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-openaireplus-mapping-utils-7.0.1