Revision 55760
Added by Claudio Atzori almost 5 years ago
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/bulktag/CommunityConfigurationFactoryTest.java | ||
---|---|---|
27 | 27 |
public void parseTest() throws DocumentException { |
28 | 28 |
|
29 | 29 |
final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); |
30 |
assertEquals(cc.size(),4);
|
|
30 |
assertEquals(5,cc.size());
|
|
31 | 31 |
cc.getCommunityList().forEach(c -> assertTrue(StringUtils.isNoneBlank(c.getId()))); |
32 | 32 |
|
33 | 33 |
|
modules/dnet-openaireplus-mapping-utils/trunk/src/test/resources/eu/dnetlib/data/bulktag/community_configuration.xml | ||
---|---|---|
1 | 1 |
<communities> |
2 | 2 |
<community id="fet-fp7"> |
3 |
<oacommunity/> |
|
3 | 4 |
<subjects/> |
4 | 5 |
<datasources/> |
5 | 6 |
<zenodocommunities/> |
... | ... | |
161 | 162 |
</datasources> |
162 | 163 |
<zenodocommunities/> |
163 | 164 |
</community> |
165 |
<community id="clarin"> |
|
166 |
<oacommunity>oac_clarin</oacommunity> |
|
167 |
<subjects/> |
|
168 |
<datasources> |
|
169 |
<datasource> |
|
170 |
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId> |
|
171 |
<selcriteria/> |
|
172 |
</datasource> |
|
173 |
</datasources> |
|
174 |
<zenodocommunities/> |
|
175 |
</community> |
|
164 | 176 |
</communities> |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/mapreduce/util/OafEntityDecoder.java | ||
---|---|---|
51 | 51 |
return field; |
52 | 52 |
} |
53 | 53 |
|
54 |
public List<String> getFieldValues(final String path) { |
|
55 |
return processPath(getOafEntity(), path, eu.dnetlib.pace.config.Type.String).stream() |
|
56 |
.map(o -> o.toString()) |
|
57 |
.collect(Collectors.toCollection(LinkedList::new)); |
|
58 |
} |
|
59 |
|
|
60 |
|
|
61 | 54 |
public String getDateOfCollection() { |
62 | 55 |
return oafEntity.getDateofcollection(); |
63 | 56 |
} |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/index/CloudIndexClientFactory.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.index; |
2 | 2 |
|
3 |
import eu.dnetlib.functionality.index.utils.ZkServers; |
|
3 | 4 |
import org.apache.commons.logging.Log; |
4 | 5 |
import org.apache.commons.logging.LogFactory; |
5 |
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
|
6 |
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
|
6 | 7 |
import org.apache.solr.client.solrj.response.SolrPingResponse; |
7 | 8 |
|
8 | 9 |
/** |
... | ... | |
15 | 16 |
public static CloudIndexClient newIndexClient(final String baseURL, final String collection, final boolean parallelUpdates) |
16 | 17 |
throws CloudIndexClientException { |
17 | 18 |
try { |
18 |
final CloudSolrServer client = new CloudSolrServer(baseURL); |
|
19 |
|
|
20 | 19 |
log.info(String.format("Initializing solr server (%s) ...", baseURL)); |
21 | 20 |
|
21 |
final ZkServers zk = ZkServers.newInstance(baseURL); |
|
22 |
final CloudSolrClient client = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()) |
|
23 |
.withParallelUpdates(parallelUpdates) |
|
24 |
.build(); |
|
25 |
|
|
22 | 26 |
client.connect(); |
23 |
|
|
24 |
client.setParallelUpdates(parallelUpdates); |
|
25 | 27 |
client.setDefaultCollection(collection); |
26 | 28 |
|
27 | 29 |
final SolrPingResponse rsp = client.ping(); |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/index/CloudIndexClient.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.index; |
2 | 2 |
|
3 |
import java.io.Closeable; |
|
4 |
import java.io.IOException; |
|
5 |
import java.text.SimpleDateFormat; |
|
6 |
import java.util.Date; |
|
7 |
import java.util.List; |
|
8 |
|
|
9 | 3 |
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory; |
10 | 4 |
import eu.dnetlib.miscutils.datetime.HumanTime; |
11 | 5 |
import eu.dnetlib.miscutils.functional.UnaryFunction; |
... | ... | |
13 | 7 |
import org.apache.commons.logging.LogFactory; |
14 | 8 |
import org.apache.solr.client.solrj.SolrQuery; |
15 | 9 |
import org.apache.solr.client.solrj.SolrServerException; |
16 |
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
|
10 |
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
|
17 | 11 |
import org.apache.solr.client.solrj.response.QueryResponse; |
18 | 12 |
import org.apache.solr.client.solrj.response.UpdateResponse; |
19 | 13 |
import org.apache.solr.common.SolrInputDocument; |
20 | 14 |
|
15 |
import java.io.Closeable; |
|
16 |
import java.io.IOException; |
|
17 |
import java.text.SimpleDateFormat; |
|
18 |
import java.util.Date; |
|
19 |
import java.util.List; |
|
20 |
|
|
21 | 21 |
/** |
22 | 22 |
* Created by michele on 11/11/15. |
23 | 23 |
*/ |
... | ... | |
26 | 26 |
private static final Log log = LogFactory.getLog(CloudIndexClient.class); |
27 | 27 |
private static final String INDEX_RECORD_RESULT_FIELD = "dnetResult"; |
28 | 28 |
|
29 |
private final CloudSolrServer solrClient;
|
|
29 |
private final CloudSolrClient solrClient;
|
|
30 | 30 |
|
31 |
protected CloudIndexClient(final CloudSolrServer solrServer) {
|
|
31 |
protected CloudIndexClient(final CloudSolrClient solrServer) {
|
|
32 | 32 |
this.solrClient = solrServer; |
33 | 33 |
} |
34 | 34 |
|
... | ... | |
153 | 153 |
|
154 | 154 |
public void close() throws IOException { |
155 | 155 |
if (solrClient != null) { |
156 |
solrClient.shutdown();
|
|
156 |
solrClient.close();
|
|
157 | 157 |
} |
158 | 158 |
} |
159 | 159 |
|
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/bulktag/CommunityConfigurationFactory.java | ||
---|---|---|
99 | 99 |
} |
100 | 100 |
|
101 | 101 |
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) { |
102 |
final Node oacommunitynode = node.selectSingleNode("./oacommunity"); |
|
103 |
String oacommunity = null; |
|
104 |
if (oacommunitynode != null){ |
|
105 |
String tmp = oacommunitynode.getText(); |
|
106 |
if(StringUtils.isNotBlank(tmp)) |
|
107 |
oacommunity = tmp; |
|
108 |
} |
|
102 | 109 |
|
110 |
|
|
103 | 111 |
final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity"); |
104 | 112 |
final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>(); |
105 | 113 |
for(Node n : list){ |
... | ... | |
109 | 117 |
|
110 | 118 |
zenodoCommunityList.add(zc); |
111 | 119 |
} |
120 |
if(oacommunity != null){ |
|
121 |
ZenodoCommunity zc = new ZenodoCommunity(); |
|
122 |
zc.setZenodoCommunityId(oacommunity); |
|
123 |
zenodoCommunityList.add(zc); |
|
124 |
} |
|
112 | 125 |
log.info("size of the zenodo community list " + zenodoCommunityList.size()); |
113 | 126 |
return zenodoCommunityList; |
114 | 127 |
} |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/bulktag/CommunityConfiguration.java | ||
---|---|---|
66 | 66 |
for(ZenodoCommunity zc : c.getZenodoCommunities()){ |
67 | 67 |
add(zc.getZenodoCommunityId(),new Pair<>(id,zc.getSelCriteria()),zenodocommunityMap); |
68 | 68 |
} |
69 |
|
|
69 | 70 |
} |
70 | 71 |
} |
71 | 72 |
|
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/AbstractDNetXsltFunctions.java | ||
---|---|---|
1463 | 1463 |
|
1464 | 1464 |
|
1465 | 1465 |
|
1466 |
private static final Set<String> invalidPidTypes = Sets.newHashSet("distributionlocation", "url", " ", "local accession id", "local", "landingpage"); |
|
1467 |
|
|
1466 |
private static final Set<String> invalidPidTypes = |
|
1467 |
Sets.newHashSet("distributionlocation", "url", " ", "local accession id", "local", "local id", "a local accession number", "landingpage", "publisherid", "report number", "uri", "contract", "doc", |
|
1468 |
"issn", "issn (online)", "issn (print)", "eissn", "citation", "unknown", "other", "oai", "case number", "section", "series", "report", |
|
1469 |
"other numbers", "site id", "fulltext", "internal", "report numbers", "product number", "depositor id", "isbn13", "doe contract number", "revision", |
|
1470 |
"issue", "pages", "volume", "another identifier for this resource", "csvdownload", "hepdatarecord", "hepdatarecordalt", "rootdownload", "yamldownload", "yodadownload", |
|
1471 |
"md5", "firstid", "uuid", "poster number", "compactidentifiers", "sample_id", "source identifier", "lod-catalog", "internal id", "funder", "department", |
|
1472 |
"odin doi viewer", "odin matdb viewer", "bitstream", "dipartimento", "technical note (national research council of canada. division of building research) series", |
|
1473 |
"internal report (national research council canada. division of building research) series", "dk.dda.ddieditor.version", "extended kim id", "kim id", "ccin", |
|
1474 |
"dk.dda.study.annonymizeddata", "e-issn", "call number", "sequenza"); |
|
1468 | 1475 |
protected static List<StructuredProperty> parsePids(final NodeList nodelist) { |
1469 | 1476 |
|
1470 | 1477 |
final List<StructuredProperty> pids = Lists.newArrayList(); |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/OdfToHbaseXsltFunctions.java | ||
---|---|---|
17 | 17 |
import eu.dnetlib.data.proto.ResultProtos.Result; |
18 | 18 |
import eu.dnetlib.data.proto.ResultProtos.Result.Context; |
19 | 19 |
import eu.dnetlib.data.proto.ResultProtos.Result.Instance; |
20 |
import eu.dnetlib.data.proto.ResultProtos.Result.Metadata.Builder; |
|
20 | 21 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
21 | 22 |
import org.apache.commons.lang3.StringUtils; |
22 | 23 |
import org.w3c.dom.Element; |
... | ... | |
143 | 144 |
|
144 | 145 |
final NodeList creatorNames = creator.getElementsByTagName("creatorName"); |
145 | 146 |
if (creatorNames.getLength() > 0) { |
146 |
final Element creatorName = (Element) creatorNames.item(0); |
|
147 |
|
|
148 |
final Author.Builder author = Author.newBuilder(); |
|
149 |
author.setRank(i+1); |
|
150 |
final String fullname = StringUtils.trim(creatorName.getTextContent()); |
|
151 |
|
|
152 |
author.setFullname(fullname); |
|
153 |
|
|
154 |
final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false); |
|
155 |
if (p.isAccurate()) { |
|
156 |
author.setName(p.getNormalisedFirstName()); |
|
157 |
author.setSurname(p.getNormalisedSurname()); |
|
147 |
createAuthor(metadataProto, i, creator, creatorNames); |
|
148 |
} else{ |
|
149 |
//handle authors with namespaceprefix |
|
150 |
final NodeList creatorNamesNs = creator.getElementsByTagNameNS("http://datacite.org/schema/kernel-4", "creatorName"); |
|
151 |
if (creatorNamesNs.getLength() > 0) { |
|
152 |
createAuthor(metadataProto, i, creator, creatorNamesNs); |
|
158 | 153 |
} |
159 |
final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier"); |
|
160 |
if (nameIdentifiers.getLength() > 0) { |
|
161 |
final Element nameIdentifier = (Element) nameIdentifiers.item(0); |
|
162 |
final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme"); |
|
163 |
final String id = StringUtils.trim(nameIdentifier.getTextContent()); |
|
164 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) { |
|
165 |
author.addPid(getKV(nameIdentifierScheme, id)); |
|
166 |
} |
|
167 |
} |
|
168 | 154 |
|
169 |
addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author); |
|
170 | 155 |
} |
171 | 156 |
} |
172 | 157 |
} |
... | ... | |
349 | 334 |
addField(instance, Instance.getDescriptor().findFieldByName("instancetype"), |
350 | 335 |
setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(cobjcategoryCode))); |
351 | 336 |
|
352 |
addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri); |
|
353 | 337 |
if (StringUtils.isNotBlank(landingPage)) { |
354 | 338 |
addField(instance, Instance.getDescriptor().findFieldByName("url"), landingPage); |
355 | 339 |
} |
340 |
//sometimes the instanceUri is blank... |
|
341 |
if (StringUtils.isNotBlank(instanceUri)) { |
|
342 |
addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri); |
|
343 |
} |
|
344 |
|
|
356 | 345 |
addField(instance, Instance.getDescriptor().findFieldByName("distributionlocation"), getFirstItem(distributionlocation)); |
357 | 346 |
|
358 | 347 |
addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFroms); |
... | ... | |
386 | 375 |
|
387 | 376 |
} |
388 | 377 |
|
378 |
private static void createAuthor(final Builder metadataProto, final int i, final Element creator, final NodeList creatorNames) { |
|
379 |
final Element creatorName = (Element) creatorNames.item(0); |
|
380 |
|
|
381 |
final Author.Builder author = Author.newBuilder(); |
|
382 |
author.setRank(i+1); |
|
383 |
final String fullname = StringUtils.trim(creatorName.getTextContent()); |
|
384 |
|
|
385 |
author.setFullname(fullname); |
|
386 |
|
|
387 |
final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false); |
|
388 |
if (p.isAccurate()) { |
|
389 |
author.setName(p.getNormalisedFirstName()); |
|
390 |
author.setSurname(p.getNormalisedSurname()); |
|
391 |
} |
|
392 |
final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier"); |
|
393 |
if (nameIdentifiers.getLength() > 0) { |
|
394 |
final Element nameIdentifier = (Element) nameIdentifiers.item(0); |
|
395 |
final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme"); |
|
396 |
final String id = StringUtils.trim(nameIdentifier.getTextContent()); |
|
397 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) { |
|
398 |
author.addPid(getKV(nameIdentifierScheme, id)); |
|
399 |
} |
|
400 |
} |
|
401 |
|
|
402 |
addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author); |
|
403 |
} |
|
404 |
|
|
389 | 405 |
private static String getResultType(final NodeList cobjcategoryNode) { |
390 | 406 |
|
391 | 407 |
final ValueMap values = ValueMap.parseNodeList(cobjcategoryNode); |
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java | ||
---|---|---|
50 | 50 |
* |
51 | 51 |
* @param proto |
52 | 52 |
* the proto |
53 |
* @param fieldDef |
|
54 |
* the field definition descriptor |
|
55 | 53 |
* @param path |
56 |
* the path
|
|
54 |
* the path
|
|
57 | 55 |
* @return the list |
58 | 56 |
*/ |
59 | 57 |
protected List<Object> processPath(final GeneratedMessage proto, final FieldDef fieldDef, final String path) { |
... | ... | |
65 | 63 |
* |
66 | 64 |
* @param proto |
67 | 65 |
* the proto |
68 |
* @param path |
|
69 |
* the path |
|
70 |
* @param type |
|
71 |
* the type |
|
72 |
* @return the list |
|
73 |
*/ |
|
74 |
protected List<Object> processPath(final GeneratedMessage proto, final String path, final Type type) { |
|
75 |
final FieldDef fieldDef = new FieldDef(); |
|
76 |
fieldDef.setType(type); |
|
77 |
return processPath(proto, fieldDef, Lists.newLinkedList(Splitter.on(PATH_SEPARATOR).trimResults().split(path))); |
|
78 |
} |
|
79 |
|
|
80 |
/** |
|
81 |
* Process path. |
|
82 |
* |
|
83 |
* @param proto |
|
84 |
* the proto |
|
85 | 66 |
* @param pathElements |
86 | 67 |
* the list |
87 | 68 |
* @return the list |
modules/dnet-openaireplus-mapping-utils/trunk/pom.xml | ||
---|---|---|
10 | 10 |
<groupId>eu.dnetlib</groupId> |
11 | 11 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
12 | 12 |
<packaging>jar</packaging> |
13 |
<version>6.2.29-SNAPSHOT</version>
|
|
13 |
<version>6.2.24-SNAPSHOT</version>
|
|
14 | 14 |
<scm> |
15 | 15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-openaireplus-mapping-utils/trunk</developerConnection> |
16 | 16 |
</scm> |
... | ... | |
106 | 106 |
<dependency> |
107 | 107 |
<groupId>eu.dnetlib</groupId> |
108 | 108 |
<artifactId>dnet-index-solr-common</artifactId> |
109 |
<version>[1.0.0,1.3.3]</version> |
|
110 |
<!-- uncomment to include solrj 7.2.0 --> |
|
111 |
<!--<version>[1.0.0,2.0.0]</version>--> |
|
109 |
<version>[2.3.3-solr75]</version> |
|
112 | 110 |
</dependency> |
113 | 111 |
<dependency> |
114 | 112 |
<groupId>com.googlecode.protobuf-java-format</groupId> |
Also available in: Unified diff
reintegrated branch solr75 -r53766:HEAD