Revision 50818
Added by Sandro La Bruzzo over 6 years ago
modules/dnet-pid-resolver/trunk/pom.xml | ||
---|---|---|
42 | 42 |
<version>1.8.5</version> |
43 | 43 |
<scope>test</scope> |
44 | 44 |
</dependency> |
45 |
<dependency> |
|
46 |
<groupId>eu.dnetlib</groupId> |
|
47 |
<artifactId>dnet-msro-service</artifactId> |
|
48 |
<version>5.0.0-SNAPSHOT</version> |
|
49 |
</dependency> |
|
45 | 50 |
</dependencies> |
46 | 51 |
</project> |
webapps/dnet-dli-container/trunk/src/main/resources/eu/dnetlib/cnr-site.properties | ||
---|---|---|
66 | 66 |
#dnet.logger.mongo.host=localhost |
67 | 67 |
dnet.logger.mongo.port=27017 |
68 | 68 |
dnet.modular.ui.authorization.mongo.host=playground-t.dnet.d4science.org |
69 |
services.mdstore.mongodb.host=playground-t.dnet.d4science.org |
|
69 |
#services.mdstore.mongodb.host=playground-t.dnet.d4science.org
|
|
70 | 70 |
#dnet.modular.ui.authorization.mongo.host=localhost |
71 |
#services.mdstore.mongodb.host=localhost |
|
72 |
services.dli.resolver.store.DatabaseName=resolvedStore |
|
71 |
services.mdstore.mongodb.host=localhost |
|
72 |
services.dli.resolver.store.DatabaseName=dliResolvedStore |
|
73 |
|
|
74 |
services.dli.resolver.crossRef.dump=ES |
modules/dnet-msro-service/trunk/src/main/java/eu/dnetlib/msro/workflows/nodes/transform/IncrementalTransformationJobNode.java | ||
---|---|---|
1 |
package eu.dnetlib.msro.workflows.nodes.transform; |
|
2 |
|
|
3 |
import eu.dnetlib.enabling.locators.UniqueServiceLocator; |
|
4 |
import eu.dnetlib.msro.logging.DnetLogger; |
|
5 |
import eu.dnetlib.msro.workflows.graph.Arc; |
|
6 |
import eu.dnetlib.msro.workflows.nodes.DateProcessUtils; |
|
7 |
import eu.dnetlib.msro.workflows.nodes.SimpleJobNode; |
|
8 |
import eu.dnetlib.msro.workflows.procs.Env; |
|
9 |
import eu.dnetlib.msro.workflows.procs.ProcessAware; |
|
10 |
import eu.dnetlib.msro.workflows.procs.WorkflowProcess; |
|
11 |
import eu.dnetlib.msro.workflows.util.WorkflowsConstants; |
|
12 |
import org.apache.commons.logging.Log; |
|
13 |
import org.apache.commons.logging.LogFactory; |
|
14 |
import org.springframework.beans.factory.annotation.Autowired; |
|
15 |
|
|
16 |
import java.util.HashMap; |
|
17 |
import java.util.Map; |
|
18 |
|
|
19 |
/**
 * Workflow node that decides, at run time, which collection mode the rest of the
 * workflow should use and publishes that decision in the workflow {@link Env}.
 *
 * <p>When the configured {@code transformationType} equals {@code INCREMENTAL}
 * (case-insensitive), the node asks {@code DateProcessUtils.getEndDate(process, dnetLogger)}
 * for a date. If none is returned it sets the env attribute {@code collectionMode} to
 * {@code REFRESH}; otherwise it sets {@code collectionMode} to {@code INCREMENTAL} and
 * {@code incrementalDateFrom} to the returned date. For any other (or missing)
 * {@code transformationType} the env is left untouched.
 *
 * <p>NOTE(review): the meaning of the value returned by {@code getEndDate} is not visible
 * here - presumably the end date of a previous run of this workflow; confirm against
 * {@code DateProcessUtils}.
 */
public class IncrementalTransformationJobNode extends SimpleJobNode implements ProcessAware { |

20 |
|

21 |
// Supported modes; package-private, its constant names double as the env attribute values.
enum TransformationType { |

22 |
INCREMENTAL, |

23 |
REFRESH |

24 |
} |

25 |
|

26 |
// NOTE(review): this logger is never used inside the class.
private static final Log log = LogFactory.getLog(IncrementalTransformationJobNode.class); |

27 |
|

28 |
// Requested mode, injected through setTransformationType; compared case-insensitively in execute().
private String transformationType; |

29 |
|

30 |
// Current workflow process, supplied through setProcess (ProcessAware callback).
private WorkflowProcess process; |

31 |
|

32 |
@Autowired |

33 |
private DnetLogger dnetLogger; |

34 |
|

35 |
// NOTE(review): injected but never referenced inside the class.
@Autowired |

36 |
private UniqueServiceLocator locator; |

37 |
|

38 |
/**
 * Publishes the effective collection mode in {@code env} and continues on the default arc.
 *
 * @param env workflow environment that receives {@code collectionMode} and, in the
 *            incremental case, {@code incrementalDateFrom}
 * @return always {@code Arc.DEFAULT_ARC}
 * @throws Exception propagated from {@code DateProcessUtils.getEndDate}
 */
@Override |

39 |
protected String execute(Env env) throws Exception { |

40 |
|

41 |
// Constant on the left-hand side: a null transformationType simply skips the branch.
if (TransformationType.INCREMENTAL.toString().equalsIgnoreCase(getTransformationType())) { |

42 |
final String endDate = DateProcessUtils.getEndDate(process, dnetLogger); |

43 |
// No date available: fall back to a full REFRESH.
if (endDate == null ) |

44 |
{ |

45 |
env.setAttribute("collectionMode", TransformationType.REFRESH.toString()); |

46 |
} |

47 |
else { |

48 |
env.setAttribute("collectionMode", TransformationType.INCREMENTAL.toString()); |

49 |
env.setAttribute("incrementalDateFrom", endDate); |

50 |
} |

51 |
} |

52 |
return Arc.DEFAULT_ARC; |

53 |
} |

54 |
|

55 |
|

56 |
|

57 |
|

58 |
|

59 |
/** Sets the requested mode; expected values are the TransformationType constant names. */
public void setTransformationType(String transformationType) { |

60 |
this.transformationType = transformationType; |

61 |
} |

62 |
|

63 |
/** Returns the requested mode exactly as it was configured (may be null). */
public String getTransformationType() { |

64 |
return transformationType; |

65 |
} |

66 |
|

67 |
/** Stores the workflow process that execute() later passes to DateProcessUtils. */
@Override |

68 |
public void setProcess(WorkflowProcess process) { |

69 |
this.process = process; |

70 |
} |

71 |
} |
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupMinDistGraphJob.xml | ||
---|---|---|
33 | 33 |
<PROPERTY key="mapreduce.map.speculative" value="false"/> |
34 | 34 |
<PROPERTY key="mapreduce.reduce.speculative" value="false"/> |
35 | 35 |
|
36 |
<PROPERTY key="mapred.reduce.tasks" value="1"/> |
|
36 |
<PROPERTY key="mapred.reduce.tasks" value="100"/>
|
|
37 | 37 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
38 | 38 |
|
39 | 39 |
<!-- Uncomment to override the default lib path --> |
modules/dnet-msro-service/trunk/src/main/java/eu/dnetlib/msro/workflows/nodes/mdstore/StoreMDStoreRecordsJobNode.java | ||
---|---|---|
39 | 39 |
final ResultSet<?> rs = token.getEnv().getAttribute(getEprParam(), ResultSet.class); |
40 | 40 |
|
41 | 41 |
token.setProgressProvider(new ResultsetProgressProvider(rs, this.resultSetClient)); |
42 |
final String collectionMode = token.getEnv().getAttribute("collectionMode", String.class); |
|
42 | 43 |
|
43 | 44 |
job.setAction("FEED"); |
44 | 45 |
job.getParameters().put("epr", rs.toJson()); |
45 |
job.getParameters().put("storingType", getStoringType());
|
|
46 |
job.getParameters().put("storingType",collectionMode!=null?collectionMode:getStoringType());
|
|
46 | 47 |
job.getParameters().put("mdId", getMdId()); |
47 | 48 |
} |
48 | 49 |
|
modules/dnet-dli/trunk/src/test/java/eu/dnetlib/dli/transform/DLITransformTest.java | ||
---|---|---|
104 | 104 |
|
105 | 105 |
} |
106 | 106 |
|
107 |
|
|
107 | 108 |
/**
 * Applies the classpath stylesheet {@code ena.xslt} to the sample record
 * {@code input_ena.xml} and prints the indented result to standard output.
 *
 * <p>NOTE(review): the test contains no assertions, so it only fails when stylesheet
 * compilation or the transformation itself throws. The two resource streams are never
 * closed.
 */
@Test |
109 |
public void testENATransform() throws Exception { |

110 |
|

111 |
final InputStream xsltAsStream = |

112 |
getClass().getResourceAsStream("/eu/dnetlib/dli/transform/ena.xslt"); |

113 |
final TransformerFactory factory = TransformerFactory.newInstance(); |

114 |
// NOTE(review): the transformer created here is discarded; the call has no visible effect on the test.
factory.newTransformer(); |

115 |
final StreamSource xsltSource = new StreamSource(xsltAsStream); |

116 |
final InputStream recordStream = this.getClass().getResourceAsStream("/eu/dnetlib/dli/transform/input_ena.xml"); |

117 |
// Compiles the stylesheet; this is the transformer that is actually used below.
final Transformer transformer = factory.newTransformer(xsltSource); |

118 |
transformer.setOutputProperty(OutputKeys.INDENT, "yes"); |

119 |
final StringWriter output = new StringWriter(); |

120 |
transformer.transform(new StreamSource(recordStream), new StreamResult(output)); |

121 |
System.out.println(output.toString()); |

122 |
|

123 |
} |
|
124 |
|
|
125 |
@Test |
|
108 | 126 |
public void testIEEETransform() throws Exception { |
109 | 127 |
|
110 | 128 |
final InputStream xsltAsStream = |
modules/dnet-dli/trunk/src/test/resources/eu/dnetlib/dli/transform/input_ena.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/" |
|
3 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri" |
|
4 |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> |
|
5 |
<oai:header> |
|
6 |
<dri:objIdentifier xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
7 |
xmlns:datacite="http://datacite.org/schema/kernel-3">r3d100010527::00000648e410cb638823564cf5fdc885</dri:objIdentifier> |
|
8 |
<dri:recordIdentifier xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
9 |
xmlns:datacite="http://datacite.org/schema/kernel-3">http://www.ebi.ac.uk/ena/data/search/?query=11118631&result=sequence_release::url</dri:recordIdentifier> |
|
10 |
<dri:dateOfCollection xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
11 |
xmlns:datacite="http://datacite.org/schema/kernel-3">2017-09-18T14:07:46.84+02:00</dri:dateOfCollection> |
|
12 |
<dri:repositoryId xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
13 |
xmlns:datacite="http://datacite.org/schema/kernel-3">dli_________::r3d100010527</dri:repositoryId> |
|
14 |
<dri:datasourceprefix xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
15 |
xmlns:datacite="http://datacite.org/schema/kernel-3">r3d100010527</dri:datasourceprefix> |
|
16 |
</oai:header> |
|
17 |
<metadata xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
18 |
xmlns:datacite="http://datacite.org/schema/kernel-3" |
|
19 |
xmlns:dc="http://purl.org/dc/elements/1.1/"> |
|
20 |
<resource xmlns="http://datacite.org/schema/kernel-3" |
|
21 |
xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd"> |
|
22 |
<identifier xmlns="" identifierType="url">http://www.ebi.ac.uk/ena/data/search/?query=11118631&result=sequence_release</identifier> |
|
23 |
<titles xmlns=""> |
|
24 |
<title>Nucleotide sequences (Release) (1/2) of "Molecular analysis of a t(7;14)(q35;q32) chromosome translocation in a T cell leukemia of a patient with ataxia telangiectasia." </title> |
|
25 |
</titles> |
|
26 |
<dates xmlns=""> |
|
27 |
<date dateType="Collected">2000-04-21</date> |
|
28 |
</dates> |
|
29 |
<descriptions xmlns=""> |
|
30 |
<description>Catharanthus roseus 2C-methyl-D-erythritol 2,4-cyclodiphosphate synthase (MECS) mRNA, complete cds.</description> |
|
31 |
</descriptions> |
|
32 |
<subjects xmlns=""> |
|
33 |
<subject subjectScheme="organism">Catharanthus roseus</subject> |
|
34 |
<subject subjectScheme="codon_start">1</subject> |
|
35 |
<subject subjectScheme="gene">MECS</subject> |
|
36 |
<subject subjectScheme="product">2C-methyl-D-erythritol 2,4-cyclodiphosphate synthase</subject> |
|
37 |
<subject subjectScheme="note">YgbB protein</subject> |
|
38 |
<subject subjectScheme="protein_id">AAF65155.1</subject> |
|
39 |
<subject subjectScheme="translation">MAMATSFYCSTAIPSKKTNQNRENFLCSPVGGSKTTPSYIRLSTRQSRTLSLVVSAAASGAAVEAEPKFAAVTPSKILSF |
|
40 |
RVGHGFDLHRLEPGYPLIIGGINIPHDRGCEAHSDGDVLLHCVVDAILGALGLPDIGQIFPDTDPKWKGAPSSVFIKEAV |
|
41 |
RLMDEAGYELGNLDATLILQRPKVSPHKEAIRQNLCQLLGADPCVVNLKAKTHEKVDSLGENRSIAAHTVVLLMRK</subject> |
|
42 |
|
|
43 |
</subjects> |
|
44 |
<resourceType xmlns="" resourceTypeGeneral="dataset">dataset</resourceType> |
|
45 |
<oaf:relatedIdentifier xmlns="" entityType="publication" inverseRelationType="isRelatedTo" |
|
46 |
relatedIdentifierType="dnet" |
|
47 |
relationType="isRelatedTo">r3d100010527::f89cc26f592efddee8437a6a73a6ab5a</oaf:relatedIdentifier> |
|
48 |
</resource> |
|
49 |
</metadata> |
|
50 |
<oai:about> |
|
51 |
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" |
|
52 |
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd"> |
|
53 |
<originDescription xmlns="" altered="true" harvestDate="2018-01-24T16:52:33.227+01:00"> |
|
54 |
<baseURL>%2Fsrv%2Fmongo%2Fdli_import%2Fena.gz</baseURL> |
|
55 |
<identifier/> |
|
56 |
<datestamp/> |
|
57 |
<metadataNamespace/> |
|
58 |
</originDescription> |
|
59 |
</provenance> |
|
60 |
</oai:about> |
|
61 |
</oai:record> |
modules/dnet-msro-service/trunk/src/main/resources/eu/dnetlib/msro/service/applicationContext-msro-nodes.xml | ||
---|---|---|
102 | 102 |
class="eu.dnetlib.msro.workflows.nodes.transform.TransformJobNode" |
103 | 103 |
scope="prototype"/> |
104 | 104 |
|
105 |
<bean id="wfNodeIncrementalTransformation" |
|
106 |
class="eu.dnetlib.msro.workflows.nodes.transform.IncrementalTransformationJobNode" |
|
107 |
scope="prototype"/> |
|
108 |
|
|
109 |
|
|
105 | 110 |
<bean id="wfNodeApplyXslt" |
106 | 111 |
class="eu.dnetlib.msro.workflows.nodes.transform.ApplyXsltJobNode" |
107 | 112 |
scope="prototype"/> |
modules/dnet-dli/trunk/src/test/resources/eu/dnetlib/dli/transform/ena.xslt | ||
---|---|---|
1 |
<!--
  ENA transformation used by the testENATransform test.
  Copies the input record unchanged (identity transform in mode "dli") except for any
  element whose local name is "about", which is replaced by a generated oaf:about block.
  NOTE(review): the stylesheet declares version="2.0"; whether the processor picked by
  the test is an XSLT 2.0 processor is not visible here - confirm.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" |

2 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri" |

3 |
xmlns:dc="http://purl.org/dc/elements/1.1/" |

4 |
xmlns:oaf="http://namespace.dnet.eu/oaf" version="2.0" exclude-result-prefixes="xsl"> |

5 |
<!-- Entry point: process the whole document in mode "dli". -->
<xsl:template match="/"> |

6 |
<xsl:apply-templates mode="dli"></xsl:apply-templates> |

7 |
</xsl:template> |

8 |
|

9 |
<!-- Identity rule: copy every attribute and node, recursing in the same mode. -->
<xsl:template match="@*|node()" mode="dli"> |

10 |
<xsl:copy> |

11 |
<xsl:apply-templates select="@*|node()" mode="dli"/> |

12 |
</xsl:copy> |

13 |
</xsl:template> |

14 |
|

<!--
  Replaces the "about" element with provenance info for the European Nucleotide Archive.
  The original children of "about" are not processed, so they do not reach the output.
  Records containing a dc:type equal to 'publication' are marked as resolved from PubMed;
  all other records are marked as collected.
-->
15 |
<xsl:template match="*[local-name()='about']" mode="dli"> |

16 |
<oaf:about> |

17 |
<oaf:datainfo> |

18 |
<oaf:completionStatus>complete</oaf:completionStatus> |

19 |
<xsl:choose> |

20 |
<xsl:when test="//dc:type='publication'"> |

21 |
<oaf:collectedFrom completionStatus="incomplete" id="r3d100010527" name="European Nucleotide Archive" /> |

22 |
<oaf:resolvedFrom completionStatus="complete" id="dli_________::pubmed" name="PubMed"/> |

23 |
<oaf:provisionMode>resolved</oaf:provisionMode> |

24 |
</xsl:when> |

25 |
<xsl:otherwise> |

26 |
<oaf:collectedFrom completionStatus="complete" id="r3d100010527" name="European Nucleotide Archive" /> |

27 |
<oaf:provisionMode>collected</oaf:provisionMode> |

28 |
</xsl:otherwise> |

29 |
</xsl:choose> |

30 |
</oaf:datainfo> |

31 |
</oaf:about> |

32 |
</xsl:template> |

33 |
|

34 |
</xsl:stylesheet> |
modules/dnet-dli/trunk/src/main/java/eu/dnetlib/dli/resolver/CrossrefResolver.java | ||
---|---|---|
17 | 17 |
import org.bson.Document; |
18 | 18 |
import org.bson.conversions.Bson; |
19 | 19 |
import org.springframework.beans.factory.annotation.Autowired; |
20 |
import org.springframework.beans.factory.annotation.Required; |
|
20 | 21 |
|
21 | 22 |
// TODO: Auto-generated Javadoc |
22 | 23 |
|
... | ... | |
42 | 43 |
@Autowired |
43 | 44 |
private CrossRefParserJSON parser; |
44 | 45 |
|
46 |
@Autowired |
|
47 |
private MongoClient mongoClient; |
|
45 | 48 |
|
49 |
private MongoDatabase db; |
|
50 |
|
|
51 |
private String dumpType; |
|
52 |
|
|
53 |
|
|
46 | 54 |
@Override |
47 | 55 |
protected boolean canResolvePid(final String pidType) { |
48 | 56 |
return (pidType != null) && ("doi".equals(pidType.toLowerCase().trim()) || "handle".equals(pidType.toLowerCase().trim())); |
... | ... | |
68 | 76 |
if (record.getPid() == null) return null; |
69 | 77 |
return record; |
70 | 78 |
} catch (Throwable e) { |
71 |
log.error("Error on resolve pid " + pid, e); |
|
79 |
log.error("¯\\_(ツ)_/¯ Error on resolve pid " + pid, e);
|
|
72 | 80 |
} |
73 | 81 |
return null; |
74 | 82 |
} |
75 | 83 |
|
76 | 84 |
|
77 | 85 |
/**
 * Fetches the CrossRef record for {@code pid} from the backend selected by
 * {@code dumpType}: "mongo" or "ES", compared case-insensitively.
 *
 * @param pid identifier to look up
 * @return whatever the selected backend method returns
 * @throws RuntimeException when {@code dumpType} is neither "mongo" nor "ES"
 */
private String retrieveCrossRefFromDump(final String pid) { |
86 |
// NOTE(review): dumpType is dereferenced directly, so a null value raises a
// NullPointerException here rather than the descriptive RuntimeException below.
// The setter is marked @Required, which presumably guarantees it is configured - confirm.
if (dumpType.equalsIgnoreCase("mongo")){ |

87 |
return retrieveCrossRefFromDumpMongo(pid); |

88 |
} |

89 |
else if (dumpType.equalsIgnoreCase("ES")){ |

90 |
return retrieveCrossRefFromDumpES(pid); |

91 |
} |

92 |
throw new RuntimeException("incorrect dump Type expected [mongo, ES] found: "+dumpType ); |

78 | 93 |
|
94 |
} |
|
79 | 95 |
|
80 |
final String response = requestURL("http://ip-90-147-167-137.ct1.garrservices.it:9200/crossref/item/" + pid.replaceAll("/","%2F")); |
|
81 |
|
|
82 |
return response; |
|
96 |
/**
 * Requests the record for {@code pid} over HTTP from the "crossref/item" endpoint and
 * returns the response of {@code requestURL} unchanged.
 *
 * <p>NOTE(review): the host and port are hard-coded, and only "/" is percent-encoded in
 * the pid; other reserved characters are passed through as-is.
 */
private String retrieveCrossRefFromDumpES(final String pid) { |

97 |
return requestURL("http://ip-90-147-167-137.ct1.garrservices.it:9200/crossref/item/" + pid.replaceAll("/","%2F")); |

83 | 98 |
} |
84 | 99 |
|
85 | 100 |
|
101 |
/**
 * Looks up the document whose {@code _id} equals {@code pid} in collection "dump" of
 * database "crossRef".
 *
 * @param pid identifier used as the document {@code _id}
 * @return the matching document serialised with {@code toJson()}, or {@code null} when
 *         no document matches
 */
private String retrieveCrossRefFromDumpMongo(final String pid) { |

102 |
// Lazily obtain and cache the database handle on first use.
// NOTE(review): this check-then-assign is not synchronised.
if (db == null) { |

103 |
db = mongoClient.getDatabase("crossRef"); |

86 | 104 |
|
105 |
} |

106 |
final MongoCollection<Document> crossRef = db.getCollection("dump"); |

87 | 107 |
|
108 |
DBObject query = QueryBuilder.start("_id").is(pid).get(); |

109 |
// NOTE(review): the cast assumes the DBObject built above also implements Bson - confirm for the driver version in use.
FindIterable<Document> documents = crossRef.find((Bson) query).limit(1); |

110 |
// NOTE(review): the cursor is never closed on either return path.
MongoCursor<Document> iterator = documents.iterator(); |

111 |
if (iterator.hasNext()){ |

112 |
return iterator.next().toJson(); |

113 |
} |

114 |
return null; |

115 |
} |
|
88 | 116 |
|
117 |
/** Returns the configured dump backend name as set through setDumpType. */
public String getDumpType() { |

118 |
return dumpType; |

119 |
} |
|
89 | 120 |
|
90 |
|
|
121 |
/**
 * Sets the dump backend name; retrieveCrossRefFromDump accepts "mongo" or "ES"
 * (case-insensitive) and rejects anything else at lookup time.
 */
@Required |

122 |
public void setDumpType(String dumpType) { |

123 |
this.dumpType = dumpType; |

124 |
} |
|
91 | 125 |
} |
modules/dnet-dli/trunk/src/main/java/eu/dnetlib/dli/resolver/CrossRefParserJSON.java | ||
---|---|---|
38 | 38 |
if (record == null) return null; |
39 | 39 |
JsonElement jElement = new JsonParser().parse(record); |
40 | 40 |
|
41 |
JsonElement source = jElement.getAsJsonObject().get("_source"); |
|
42 |
if (source== null || !source.isJsonObject()) |
|
41 |
JsonElement source = null; |
|
42 |
|
|
43 |
if (jElement.getAsJsonObject().has("_source")) { |
|
44 |
source = jElement.getAsJsonObject().get("_source"); |
|
45 |
if (source == null || !source.isJsonObject()) |
|
46 |
return null; |
|
47 |
} |
|
48 |
else if(jElement.getAsJsonObject().has("DOI")){ |
|
49 |
source = jElement; |
|
50 |
} else { |
|
43 | 51 |
return null; |
52 |
} |
|
44 | 53 |
|
45 | 54 |
final JsonObject message = source.getAsJsonObject(); |
46 | 55 |
DLIResolvedObject currentObject = new DLIResolvedObject(); |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/dli/resolver/applicationContext-dli-resolver.properties | ||
---|---|---|
1 | 1 |
services.dli.resolver.store.DatabaseName=resolverStore |
2 |
services.dli.resolver.store.CollectionName=resolverCollection |
|
2 |
services.dli.resolver.store.CollectionName=resolverCollection |
|
3 |
services.dli.resolver.crossRef.dump=mongo |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/dli/resolver/applicationContext-dli-resolver.xml | ||
---|---|---|
34 | 34 |
<property name="cache" ref="dliResolverCache"/> |
35 | 35 |
</bean> |
36 | 36 |
|
37 |
<bean id="crossrefResolver" class="eu.dnetlib.dli.resolver.CrossrefResolver" p:order="0" p:availableOffline="true">
|
|
38 |
<property name="cache" ref="dliResolverCache"/>
|
|
37 |
<bean id="crossrefResolver" class="eu.dnetlib.dli.resolver.CrossrefResolver" p:order="0" p:availableOffline="true" |
|
38 |
p:dumpType="${services.dli.resolver.crossRef.dump}" p:cache-ref="dliResolverCache">
|
|
39 | 39 |
</bean> |
40 | 40 |
|
41 | 41 |
<bean id="dliOfflineResolver" class="eu.dnetlib.dli.resolver.DLIOfflineResolver" p:order="0" p:availableOffline="true"> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/dli/workflows/repo-hi/dli_aggregation_wf.xml.st | ||
---|---|---|
23 | 23 |
<PARAM name="unknownMdstoreId" description="Store for unknown records" required="true" managedBy="system" category="MDSTORE_ID"/> |
24 | 24 |
<PARAM name="cleanTransformationRuleId" description="Transformation Rule Identifier" required="true" managedBy="user" category="TRANSFORMATION_RULE_ID" type="string" function="listProfiles('TransformationRuleDSResourceType', '//TITLE', 'DLI:')"/> |
25 | 25 |
<PARAM description="Type of Transformation" function="validValues(['simpleTransform', 'transformAndUnpack'])" managedBy="user" name="typeOfTransform" required="true" type="string">simpleTransform</PARAM> |
26 |
<PARAM description="Incremental Transformation" function="validValues(['INCREMENTAL', 'REFRESH'])" managedBy="user" name="collectionMode" required="false" type="string">REFRESH</PARAM> |
|
26 | 27 |
<PARAM description="Resolving offline" function="validValues(['false', 'true'])" managedBy="user" name="offlineResolving" required="true" type="string">false</PARAM> |
28 |
<PARAM name="from_date" description="Start Date of Harvesting" required="false" managedBy="user" category="COLLECTION" type="string"/> |
|
27 | 29 |
</PARAMETERS> |
28 | 30 |
<WORKFLOW> |
29 | 31 |
<NODE isStart="true" name="collection" type="LaunchWorkflowTemplate"> |
... | ... | |
35 | 37 |
<ENTRY key="dsId" value="$dsId$" /> |
36 | 38 |
<ENTRY key="interface" value="$interface$" /> |
37 | 39 |
<ENTRY key="collMdstoreId" ref="collMdstoreId" /> |
40 |
<ENTRY key="collectionMode" ref="collectionMode" /> |
|
41 |
<ENTRY key="from_date" ref="from_date" /> |
|
38 | 42 |
</MAP> |
39 | 43 |
</PARAM> |
40 | 44 |
</PARAMETERS> |
... | ... | |
53 | 57 |
<ENTRY key="collMdstoreId" ref="collMdstoreId" /> |
54 | 58 |
<ENTRY key="cleanMdstoreId" ref="cleanMdstoreId" /> |
55 | 59 |
<ENTRY key="cleanRuleId" ref="cleanTransformationRuleId" /> |
56 |
<ENTRY key="typeOfTransform" ref="typeOfTransform" /> |
|
60 |
<ENTRY key="collectionMode" ref="collectionMode" /> |
|
61 |
<ENTRY key="typeOfTransform" ref="typeOfTransform" /> |
|
57 | 62 |
|
58 | 63 |
</MAP> |
59 | 64 |
</PARAM> |
... | ... | |
73 | 78 |
<ENTRY key="collMdstoreId" ref="collMdstoreId" /> |
74 | 79 |
<ENTRY key="cleanMdstoreId" ref="cleanMdstoreId" /> |
75 | 80 |
<ENTRY key="offlineResolving" ref="offlineResolving" /> |
81 |
<ENTRY key="collectionMode" ref="collectionMode" /> |
|
76 | 82 |
</MAP> |
77 | 83 |
</PARAM> |
78 | 84 |
</PARAMETERS> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/TransformationRuleDSResources/ena_transform.xml | ||
---|---|---|
12 | 12 |
<IMPORTED/> |
13 | 13 |
<SCRIPT> |
14 | 14 |
<TITLE>DLI: ENA transform</TITLE> |
15 |
<CODE><![CDATA[<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> |
|
16 |
<xsl:template match="@*|node()"> |
|
15 |
<CODE><![CDATA[<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" |
|
16 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri" |
|
17 |
xmlns:dc="http://purl.org/dc/elements/1.1/" |
|
18 |
xmlns:oaf="http://namespace.dnet.eu/oaf" version="2.0" exclude-result-prefixes="xsl"> |
|
19 |
<xsl:template match="/"> |
|
20 |
<xsl:apply-templates mode="dli"></xsl:apply-templates> |
|
21 |
</xsl:template> |
|
22 |
|
|
23 |
<xsl:template match="@*|node()" mode="dli"> |
|
17 | 24 |
<xsl:copy> |
18 |
<xsl:apply-templates select="@*|node()"/> |
|
19 |
</xsl:copy> |
|
25 |
<xsl:apply-templates select="@*|node()" mode="dli"/>
|
|
26 |
</xsl:copy>
|
|
20 | 27 |
</xsl:template> |
28 |
|
|
29 |
<xsl:template match="*[local-name()='about']" mode="dli"> |
|
30 |
<oaf:about> |
|
31 |
<oaf:datainfo> |
|
32 |
<oaf:completionStatus>complete</oaf:completionStatus> |
|
33 |
<xsl:choose> |
|
34 |
<xsl:when test="//dc:type='publication'"> |
|
35 |
<oaf:collectedFrom completionStatus="incomplete" id="r3d100010527" name="European Nucleotide Archive" /> |
|
36 |
<oaf:resolvedFrom completionStatus="complete" id="dli_________::pubmed" name="PubMed"/> |
|
37 |
<oaf:provisionMode>resolved</oaf:provisionMode> |
|
38 |
</xsl:when> |
|
39 |
<xsl:otherwise> |
|
40 |
<oaf:collectedFrom completionStatus="complete" id="r3d100010527" name="European Nucleotide Archive" /> |
|
41 |
<oaf:provisionMode>collected</oaf:provisionMode> |
|
42 |
</xsl:otherwise> |
|
43 |
</xsl:choose> |
|
44 |
</oaf:datainfo> |
|
45 |
</oaf:about> |
|
46 |
</xsl:template> |
|
47 |
|
|
21 | 48 |
</xsl:stylesheet> |
22 | 49 |
]]></CODE> |
23 | 50 |
</SCRIPT> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/RepositoryServiceResources/ccdc.xml | ||
---|---|---|
37 | 37 |
<INTERFACE active="true" compliance="dli" contentDescription="metadata" id="api_________::opendoar____::2367::0" |
38 | 38 |
label="Links provider" typology="links::provider" removable="false"> |
39 | 39 |
<ACCESS_PROTOCOL format="oai_datacite" set="BL.CCDC">oai</ACCESS_PROTOCOL> |
40 |
<BASE_URL>http://oai.datacite.org/oai</BASE_URL> |
|
40 |
<BASE_URL>https://oai.datacite.org/oai</BASE_URL>
|
|
41 | 41 |
<INTERFACE_EXTRA_FIELD name="last_collection_date"/> |
42 | 42 |
<INTERFACE_EXTRA_FIELD name="metadata_identifier_path">//*[local-name()='header']/*[local-name()='identifier'] |
43 | 43 |
</INTERFACE_EXTRA_FIELD> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/RepositoryServiceResources/datacite.xml | ||
---|---|---|
36 | 36 |
<INTERFACES> |
37 | 37 |
<INTERFACE active="true" compliance="dli" contentDescription="metadata" id="api_________::datacite::0" |
38 | 38 |
label="Links provider" typology="links::provider" removable="false"> |
39 |
<ACCESS_PROTOCOL>mongoDump</ACCESS_PROTOCOL>
|
|
40 |
<BASE_URL>/home/sandro/mongo_backup/datacite3000.json</BASE_URL>
|
|
39 |
<ACCESS_PROTOCOL format="oai_datacite">oai</ACCESS_PROTOCOL>
|
|
40 |
<BASE_URL>https://oai.datacite.org/oai</BASE_URL>
|
|
41 | 41 |
<INTERFACE_EXTRA_FIELD name="last_collection_date"/> |
42 | 42 |
<INTERFACE_EXTRA_FIELD name="metadata_identifier_path">//*[local-name()='header']/*[local-name()='identifier'] |
43 | 43 |
</INTERFACE_EXTRA_FIELD> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/linkprovider/link_provider_resolver_template.xml | ||
---|---|---|
16 | 16 |
<PARAM name="cleanMdstoreId" description="Store for cleaned records" required="true" type="string"/> |
17 | 17 |
<PARAM name="numberOfThreads" description="number of threads for resolve PID" required="true" type="string" default="4"/> |
18 | 18 |
<PARAM name="offlineResolving" description="decide if you want resolve offline using only the resolved item" required="true" type="boolean" default="false"/> |
19 |
<PARAM name="collectionMode" description="Define the type of collection mode [INCREMENTAL, REFRESH]" required="true" default="REFRESH" type="string"/> |
|
19 | 20 |
</PARAMETERS> |
20 | 21 |
<WORKFLOW> |
21 | 22 |
|
... | ... | |
26 | 27 |
<PARAM name="pluginName" value="dliResolverPlugin"/> |
27 | 28 |
<PARAM name="numberOfThreads" ref="numberOfThreads"/> |
28 | 29 |
<PARAM name="offline" ref="offlineResolving"/> |
30 |
<PARAM name="collectionMode" ref="collectionMode"/> |
|
29 | 31 |
</PARAMETERS> |
30 | 32 |
<ARCS> |
31 | 33 |
<ARC to="UPDATE_INFO"/> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/linkprovider/link_provider_collection_template.xml | ||
---|---|---|
13 | 13 |
<PARAM name="dsId" description="Datasource Id" required="true" type="string"/> |
14 | 14 |
<PARAM name="interface" description="Datasource Interface" required="true" type="string"/> |
15 | 15 |
<PARAM name="collMdstoreId" description="Store for collected records" required="true" type="string"/> |
16 |
<PARAM name="from_date" description="From Date" required="false" type="string"/> |
|
17 |
<PARAM name="collectionMode" description="Define the type of collection mode [INCREMENTAL, REFRESH]" required="true" default="REFRESH" type="string"/> |
|
16 | 18 |
</PARAMETERS> |
17 | 19 |
<WORKFLOW> |
18 |
<NODE name="COLLECT_REFRESH" type="CollectRecords" isStart="true"> |
|
20 |
<NODE name="FindDateRangeForIncrementalHarvesting" type="FindDateRangeForIncrementalHarvesting" |
|
21 |
isStart="true"> |
|
22 |
<DESCRIPTION>Find Last execution</DESCRIPTION> |
|
23 |
<PARAMETERS> |
|
24 |
<PARAM name="fromDateParam" ref="fromDateParam"/> |
|
25 |
<PARAM name="collectionMode" ref="collectionMode"/> |
|
26 |
</PARAMETERS> |
|
27 |
<ARCS> |
|
28 |
<ARC to="COLLECT"/> |
|
29 |
</ARCS> |
|
30 |
</NODE> |
|
31 |
<NODE name="COLLECT" type="DateRangeCollectRecords"> |
|
19 | 32 |
<DESCRIPTION>Start Harvesting</DESCRIPTION> |
20 | 33 |
<PARAMETERS> |
21 | 34 |
<PARAM name="datasourceId" ref="dsId"/> |
22 | 35 |
<PARAM name="datasourceInterface" ref="interface"/> |
23 | 36 |
<PARAM name="eprParam" value="collected_epr"/> |
37 |
<PARAM name="from" ref="from_date"/> |
|
38 |
<PARAM name="fromDateParam" ref="fromDateParam"/> |
|
24 | 39 |
</PARAMETERS> |
25 | 40 |
<ARCS> |
26 | 41 |
<ARC to="MD_BUILDER"/> |
... | ... | |
35 | 50 |
<PARAM name="datasourceInterface" ref="interface"/> |
36 | 51 |
</PARAMETERS> |
37 | 52 |
<ARCS> |
38 |
<ARC to="STORE_REFRESH"/>
|
|
53 |
<ARC to="STORE"/> |
|
39 | 54 |
</ARCS> |
40 | 55 |
</NODE> |
41 |
<NODE name="STORE_REFRESH" type="StoreMDStoreRecords">
|
|
56 |
<NODE name="STORE" type="StoreMDStoreRecords"> |
|
42 | 57 |
<DESCRIPTION>Store mdstore records</DESCRIPTION> |
43 | 58 |
<PARAMETERS> |
44 | 59 |
<PARAM name="mdId" ref="collMdstoreId"/> |
45 |
<PARAM name="storingType" value="REFRESH"/>
|
|
60 |
<PARAM name="storingType" ref="collectionMode"/>
|
|
46 | 61 |
<PARAM name="eprParam" value="store_epr"/> |
47 | 62 |
</PARAMETERS> |
48 | 63 |
<ARCS> |
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/linkprovider/link_provider_transform_template.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 | 1 |
<RESOURCE_PROFILE> |
3 | 2 |
<HEADER> |
4 | 3 |
<RESOURCE_IDENTIFIER value="da3531c6-2bf6-48ab-848b-bd4c6379fd65_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/> |
5 | 4 |
<RESOURCE_TYPE value="WorkflowTemplateDSResourceType"/> |
6 | 5 |
<RESOURCE_KIND value="WorkflowTemplateDSResources"/> |
7 | 6 |
<RESOURCE_URI value=""/> |
8 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
|
|
7 |
<DATE_OF_CREATION value="2018-02-12T12:49:19+01:00"/>
|
|
9 | 8 |
</HEADER> |
10 | 9 |
<BODY> |
11 | 10 |
<CONFIGURATION> |
12 | 11 |
<PARAMETERS> |
13 |
<PARAM name="dsId" description="Datasource Id" required="true" type="string"/> |
|
14 |
<PARAM name="interface" description="Datasource Interface" required="true" type="string"/> |
|
15 |
<PARAM name="collMdstoreId" description="Store for collected records" required="true" type="string"/> |
|
16 |
<PARAM name="cleanMdstoreId" description="Store for cleaned records" required="true" type="string"/> |
|
17 |
<PARAM name="cleanRuleId" description="Transformation Rule Id" required="true" type="string"/> |
|
18 |
<PARAM name="typeOfTransform" description="Type of transformation" required="true" type="string"/> |
|
12 |
<PARAM description="Datasource Id" name="dsId" required="true" type="string"/> |
|
13 |
<PARAM description="Datasource Interface" name="interface" required="true" type="string"/> |
|
14 |
<PARAM description="Store for collected records" name="collMdstoreId" required="true" type="string"/> |
|
15 |
<PARAM description="Store for cleaned records" name="cleanMdstoreId" required="true" type="string"/> |
|
16 |
<PARAM description="Transformation Rule Id" name="cleanRuleId" required="true" type="string"/> |
|
17 |
<PARAM description="Type of transformation" name="typeOfTransform" required="true" type="string"/> |
|
18 |
<PARAM name="collectionMode" description="Define the type of collection mode [INCREMENTAL, REFRESH]" required="true" default="REFRESH" type="string"/> |
|
19 | 19 |
</PARAMETERS> |
20 | 20 |
<WORKFLOW> |
21 |
<NODE name="fetchOriginals" type="FetchMDStoreRecords" isStart="true"> |
|
21 |
<NODE isStart="true" name="setIncrementalTransformation" type="IncrementalTransformation"> |
|
22 |
<DESCRIPTION>Set incremental transformation</DESCRIPTION> |
|
23 |
<PARAMETERS> |
|
24 |
<PARAM name="transformationType" ref="collectionMode"/> |
|
25 |
</PARAMETERS> |
|
26 |
<ARCS> |
|
27 |
<ARC to="fetchOriginals"/> |
|
28 |
</ARCS> |
|
29 |
</NODE> |
|
30 |
<NODE name="fetchOriginals" type="FetchMDStoreRecords"> |
|
22 | 31 |
<DESCRIPTION>Fetch records from MDStore</DESCRIPTION> |
23 | 32 |
<PARAMETERS> |
24 | 33 |
<PARAM name="mdId" ref="collMdstoreId"/> |
25 | 34 |
<PARAM name="eprParam" value="orig_epr"/> |
26 | 35 |
</PARAMETERS> |
27 | 36 |
<ARCS> |
28 |
<ARC to="transformType"/>
|
|
37 |
<ARC to="transformType"/>
|
|
29 | 38 |
</ARCS> |
30 | 39 |
</NODE> |
31 |
|
|
32 |
<NODE name="transformType" type="Selection"> |
|
33 |
<DESCRIPTION>Clean original records</DESCRIPTION> |
|
34 |
<PARAMETERS> |
|
35 |
<PARAM name="selection" ref="typeOfTransform"/> |
|
36 |
</PARAMETERS> |
|
37 |
<ARCS> |
|
38 |
<ARC name="simpleTransform" to="simpleTransform"/> |
|
39 |
<ARC name="transformAndUnpack" to="transformAndUnpack"/> |
|
40 |
</ARCS> |
|
41 |
</NODE> |
|
42 |
|
|
43 |
|
|
44 |
<NODE name="simpleTransform" type="Transform"> |
|
40 |
<NODE name="transformType" type="Selection"> |
|
45 | 41 |
<DESCRIPTION>Clean original records</DESCRIPTION> |
46 | 42 |
<PARAMETERS> |
43 |
<PARAM name="selection" ref="typeOfTransform"/> |
|
44 |
</PARAMETERS> |
|
45 |
<ARCS> |
|
46 |
<ARC name="simpleTransform" to="simpleTransform"/> |
|
47 |
<ARC name="transformAndUnpack" to="transformAndUnpack"/> |
|
48 |
</ARCS> |
|
49 |
</NODE> |
|
50 |
<NODE name="simpleTransform" type="Transform"> |
|
51 |
<DESCRIPTION>Clean original records</DESCRIPTION> |
|
52 |
<PARAMETERS> |
|
47 | 53 |
<PARAM name="ruleId" ref="cleanRuleId"/> |
48 | 54 |
<PARAM name="inputEprParam" value="orig_epr"/> |
49 | 55 |
<PARAM name="outputEprParam" value="clean_epr"/> |
50 | 56 |
</PARAMETERS> |
51 | 57 |
<ARCS> |
52 |
<ARC to="storeSimpleRecords"/>
|
|
58 |
<ARC to="storeSimpleRecords"/>
|
|
53 | 59 |
</ARCS> |
54 | 60 |
</NODE> |
55 |
|
|
56 |
|
|
57 |
<NODE name="storeSimpleRecords" type="StoreMDStoreRecords"> |
|
58 |
<DESCRIPTION>Store mdtore records</DESCRIPTION> |
|
59 |
<PARAMETERS> |
|
60 |
<PARAM name="mdId" ref="cleanMdstoreId"/> |
|
61 |
<PARAM name="storingType" value="REFRESH"/> |
|
62 |
<PARAM name="eprParam" value="clean_epr"/> |
|
63 |
</PARAMETERS> |
|
64 |
<ARCS> |
|
65 |
<ARC to="UPDATE_INFO"/> |
|
66 |
</ARCS> |
|
67 |
</NODE> |
|
68 |
|
|
69 |
|
|
70 |
<NODE name="transformAndUnpack" type="Transform"> |
|
71 |
<DESCRIPTION>Clean original records</DESCRIPTION> |
|
72 |
<PARAMETERS> |
|
73 |
<PARAM name="ruleId" ref="cleanRuleId"/> |
|
74 |
<PARAM name="inputEprParam" value="orig_epr"/> |
|
75 |
<PARAM name="outputEprParam" value="pack_epr"/> |
|
76 |
</PARAMETERS> |
|
77 |
<ARCS> |
|
78 |
<ARC to="unpackRecord"/> |
|
79 |
</ARCS> |
|
80 |
</NODE> |
|
81 |
|
|
82 |
<NODE name="unpackRecord" type="Unpack"> |
|
83 |
<DESCRIPTION>Unpack transformed records</DESCRIPTION> |
|
84 |
<PARAMETERS> |
|
85 |
<PARAM name="xpath" value="//*[local-name()='record']"/> |
|
86 |
<PARAM name="inputEprParam" value="pack_epr"/> |
|
87 |
<PARAM name="outputEprParam" value="clean_epr"/> |
|
88 |
</PARAMETERS> |
|
89 |
<ARCS> |
|
90 |
<ARC to="storeUnpackedRecords"/> |
|
91 |
</ARCS> |
|
92 |
</NODE> |
|
93 |
|
|
94 |
<NODE name="storeUnpackedRecords" type="StoreMDStoreRecords"> |
|
61 |
<NODE name="storeSimpleRecords" type="StoreMDStoreRecords"> |
|
95 | 62 |
<DESCRIPTION>Store mdtore records</DESCRIPTION> |
96 | 63 |
<PARAMETERS> |
97 | 64 |
<PARAM name="mdId" ref="cleanMdstoreId"/> |
... | ... | |
102 | 69 |
<ARC to="UPDATE_INFO"/> |
103 | 70 |
</ARCS> |
104 | 71 |
</NODE> |
105 |
|
|
72 |
<NODE name="transformAndUnpack" type="Transform"> |
|
73 |
<DESCRIPTION>Clean original records</DESCRIPTION> |
|
74 |
<PARAMETERS> |
|
75 |
<PARAM name="ruleId" ref="cleanRuleId"/> |
|
76 |
<PARAM name="inputEprParam" value="orig_epr"/> |
|
77 |
<PARAM name="outputEprParam" value="pack_epr"/> |
|
78 |
</PARAMETERS> |
|
79 |
<ARCS> |
|
80 |
<ARC to="unpackRecord"/> |
|
81 |
</ARCS> |
|
82 |
</NODE> |
|
83 |
<NODE name="unpackRecord" type="Unpack"> |
|
84 |
<DESCRIPTION>Unpack transformed records</DESCRIPTION> |
|
85 |
<PARAMETERS> |
|
86 |
<PARAM name="xpath" value="//*[local-name()='record']"/> |
|
87 |
<PARAM name="inputEprParam" value="pack_epr"/> |
|
88 |
<PARAM name="outputEprParam" value="clean_epr"/> |
|
89 |
</PARAMETERS> |
|
90 |
<ARCS> |
|
91 |
<ARC to="storeUnpackedRecords"/> |
|
92 |
</ARCS> |
|
93 |
</NODE> |
|
94 |
<NODE name="storeUnpackedRecords" type="StoreMDStoreRecords"> |
|
95 |
<DESCRIPTION>Store mdtore records</DESCRIPTION> |
|
96 |
<PARAMETERS> |
|
97 |
<PARAM name="mdId" ref="cleanMdstoreId"/> |
|
98 |
<PARAM name="storingType" value="REFRESH"/> |
|
99 |
<PARAM name="eprParam" value="clean_epr"/> |
|
100 |
</PARAMETERS> |
|
101 |
<ARCS> |
|
102 |
<ARC to="UPDATE_INFO"/> |
|
103 |
</ARCS> |
|
104 |
</NODE> |
|
106 | 105 |
<NODE name="UPDATE_INFO" type="MDStoreToApiExtraField"> |
107 | 106 |
<DESCRIPTION>Update datasouce API extra fields</DESCRIPTION> |
108 | 107 |
<PARAMETERS> |
... | ... | |
120 | 119 |
</WORKFLOW> |
121 | 120 |
</CONFIGURATION> |
122 | 121 |
</BODY> |
123 |
</RESOURCE_PROFILE> |
|
122 |
</RESOURCE_PROFILE> |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/transform/record_dli_dmf.xml | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 |
<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/" |
|
3 |
xmlns="http://namespace.openaire.eu/"> |
|
4 |
<oai:header xmlns=""> |
|
5 |
<dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri"> |
|
6 |
r3d100010464::a57f57564a6f58554f6dad6a954bbf55 |
|
7 |
</dri:objIdentifier> |
|
8 |
<dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">1959.1/468446::hdl |
|
9 |
</dri:recordIdentifier> |
|
10 |
<dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2017-09-22T08:57:59.857+02:00 |
|
11 |
</dri:dateOfCollection> |
|
12 |
<dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri"> |
|
13 |
ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU= |
|
14 |
</dri:repositoryId> |
|
15 |
<dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464 |
|
16 |
</dri:datasourceprefix> |
|
2 |
<oai:record xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
3 |
xmlns:oai="http://www.openarchives.org/OAI/2.0/" |
|
4 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri"> |
|
5 |
<oai:header> |
|
6 |
<dri:objIdentifier xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
7 |
xmlns:datacite="http://datacite.org/schema/kernel-3">r3d100010527::00000648e410cb638823564cf5fdc885</dri:objIdentifier> |
|
8 |
<dri:recordIdentifier xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
9 |
xmlns:datacite="http://datacite.org/schema/kernel-3">http://www.ebi.ac.uk/ena/data/search/?query=11118631&result=sequence_release::url</dri:recordIdentifier> |
|
10 |
<dri:dateOfCollection xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
11 |
xmlns:datacite="http://datacite.org/schema/kernel-3">2017-09-18T14:07:46.84+02:00</dri:dateOfCollection> |
|
12 |
<dri:repositoryId xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
13 |
xmlns:datacite="http://datacite.org/schema/kernel-3">dli_________::r3d100010527</dri:repositoryId> |
|
14 |
<dri:datasourceprefix xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
15 |
xmlns:datacite="http://datacite.org/schema/kernel-3">r3d100010527</dri:datasourceprefix> |
|
17 | 16 |
</oai:header> |
18 |
<metadata xmlns=""> |
|
17 |
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" |
|
18 |
xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
19 |
xmlns:datacite="http://datacite.org/schema/kernel-3"> |
|
19 | 20 |
<resource xmlns="http://datacite.org/schema/kernel-3" |
20 |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
21 | 21 |
xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd"> |
22 |
<identifier xmlns="" identifierType="hdl">1959.1/468446</identifier> |
|
23 |
<titles xmlns=""> |
|
24 |
<title>Clinician practices while using health information system security in Australian public |
|
25 |
hospitals: data |
|
26 |
</title> |
|
22 |
<identifier identifierType="url">http://www.ebi.ac.uk/ena/data/search/?query=11118631&result=sequence_release</identifier> |
|
23 |
<titles> |
|
24 |
<title>Nucleotide sequences (Release) (1/2) of "Molecular analysis of a t(7;14)(q35;q32) chromosome translocation in a T cell leukemia of a patient with ataxia telangiectasia." </title> |
|
27 | 25 |
</titles> |
28 |
<publisher xmlns="">Monash University</publisher> |
|
29 |
<dates xmlns=""> |
|
30 |
<date dateType="Collected">2012-11-28</date> |
|
26 |
<dates> |
|
27 |
<date dateType="Collected">2000-04-21</date> |
|
31 | 28 |
</dates> |
32 |
<creators xmlns=""> |
|
33 |
<creator> |
|
34 |
<creatorName>Dr Juanita Fernando</creatorName> |
|
35 |
</creator> |
|
36 |
</creators> |
|
37 |
<resourceType xmlns="" resourceTypeGeneral="Dataset">Dataset</resourceType> |
|
38 |
<!--<relatedIdentifiers xmlns="">--> |
|
39 |
<!--<relatedIdentifier entityType=" publication "--> |
|
40 |
<!--inverseRelationType="hasAssociationWith"--> |
|
41 |
<!--relatedIdentifierType="dnet"--> |
|
42 |
<!--relationType="hasAssociationWith">r3d100010255::d5ad02b122901f0d9d59f8348768d0f9--> |
|
43 |
<!--</relatedIdentifier>--> |
|
44 |
<!--</relatedIdentifiers>--> |
|
29 |
<descriptions> |
|
30 |
<description>Catharanthus roseus 2C-methyl-D-erythritol 2,4-cyclodiphosphate synthase (MECS) mRNA, complete cds.</description> |
|
31 |
</descriptions> |
|
32 |
<subjects> |
|
33 |
<subject subjectScheme="organism">Catharanthus roseus</subject> |
|
34 |
<subject subjectScheme="codon_start">1</subject> |
|
35 |
<subject subjectScheme="gene">MECS</subject> |
|
36 |
<subject subjectScheme="product">2C-methyl-D-erythritol 2,4-cyclodiphosphate synthase</subject> |
|
37 |
<subject subjectScheme="note">YgbB protein</subject> |
|
38 |
<subject subjectScheme="protein_id">AAF65155.1</subject> |
|
39 |
<subject subjectScheme="translation">MAMATSFYCSTAIPSKKTNQNRENFLCSPVGGSKTTPSYIRLSTRQSRTLSLVVSAAASGAAVEAEPKFAAVTPSKILSF |
|
40 |
RVGHGFDLHRLEPGYPLIIGGINIPHDRGCEAHSDGDVLLHCVVDAILGALGLPDIGQIFPDTDPKWKGAPSSVFIKEAV |
|
41 |
RLMDEAGYELGNLDATLILQRPKVSPHKEAIRQNLCQLLGADPCVVNLKAKTHEKVDSLGENRSIAAHTVVLLMRK</subject> |
|
42 |
|
|
43 |
</subjects> |
|
44 |
<resourceType xmlns="" resourceTypeGeneral="dataset">dataset</resourceType> |
|
45 |
<oaf:relatedIdentifier entityType="publication" inverseRelationType="isRelatedTo" |
|
46 |
relatedIdentifierType="dnet" |
|
47 |
relationType="isRelatedTo">r3d100010527::f89cc26f592efddee8437a6a73a6ab5a</oaf:relatedIdentifier> |
|
45 | 48 |
</resource> |
46 | 49 |
</metadata> |
47 |
<oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
|
|
50 |
<oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf"> |
|
48 | 51 |
<oaf:datainfo> |
49 |
<oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464" |
|
50 |
name="Australian National Data Service"/> |
|
51 | 52 |
<oaf:completionStatus>complete</oaf:completionStatus> |
52 |
<oaf:provisionMode>collected</oaf:provisionMode> |
|
53 |
|
|
54 |
<oaf:collectedFrom completionStatus="complete" id="dli_________::datacite" |
|
55 |
name="Datasets in Datacite"/> |
|
56 |
|
|
53 | 57 |
</oaf:datainfo> |
54 | 58 |
</oaf:about> |
59 |
|
|
60 |
|
|
55 | 61 |
</oai:record> |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/transform/record_dli_pmf.xml | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 |
<record xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
3 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri" |
|
4 |
xmlns:dc="http://purl.org/dc/elements/1.1/"> |
|
5 |
<oai:header xmlns:oai="http://www.openarchives.org/OAI/2.0/" |
|
6 |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> |
|
7 |
<dri:objIdentifier>pubmed______::000087d2ea077b8526bbc3c1436737ac</dri:objIdentifier> |
|
8 |
<dri:resolvedDate>2017-10-31T16:52:57.209</dri:resolvedDate> |
|
9 |
<dri:recordIdentifier>EuropePMC:26375944</dri:recordIdentifier> |
|
10 |
<dri:dateOfCollection>2017-10-31T16:51:58.293+01:00</dri:dateOfCollection> |
|
11 |
<dri:repositoryId>pbm_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId> |
|
12 |
<dri:datasourceprefix>pubmed______</dri:datasourceprefix> |
|
2 |
<oai:record xmlns:oaf="http://namespace.dnet.eu/oaf" |
|
3 |
xmlns:datacite="http://datacite.org/schema/kernel-3" |
|
4 |
xmlns:oai="http://www.openarchives.org/OAI/2.0/" |
|
5 |
xmlns:dri="http://www.driver-repository.eu/namespace/dri"> |
|
6 |
<oai:header> |
|
7 |
<dri:objIdentifier>dli_resolver::b65cd86a86f67af94a3a25452b8ab30d</dri:objIdentifier> |
|
8 |
<dri:recordIdentifier>b65cd86a86f67af94a3a25452b8ab30d</dri:recordIdentifier> |
|
9 |
<dri:dateOfCollection>2017-12-20T15:37:59.445+01:00</dri:dateOfCollection> |
|
10 |
<dri:repositoryId>dli_resolver</dri:repositoryId> |
|
11 |
<dri:datasourceprefix>dli_resolver</dri:datasourceprefix> |
|
13 | 12 |
</oai:header> |
14 |
<metadata> |
|
15 |
<oaf:pid type="pmid">26375944</oaf:pid> |
|
16 |
<dc:identifier>http://www.ncbi.nlm.nih.gov/pubmed/26375944</dc:identifier> |
|
17 |
<dc:title>Reclassification of Saccharomycodes sinensis, Proposal of Yueomyces sinensis gen. nov., comb. nov. within Saccharomycetaceae (Saccharomycetales, Saccharomycotina).</dc:title> |
|
18 |
<dc:creator>Long Wang</dc:creator> |
|
19 |
<dc:creator>Marizeth Groenewald</dc:creator> |
|
20 |
<dc:creator>Qi-Ming Wang</dc:creator> |
|
21 |
<dc:creator>Teun Boekhout</dc:creator> |
|
22 |
<dc:date>2015-09-16</dc:date> |
|
13 |
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> |
|
14 |
<oaf:pid type="doi">10.1002/jame.20038</oaf:pid> |
|
15 |
<dc:identifier>http://dx.doi.org/10.1002/jame.20038</dc:identifier> |
|
16 |
<dc:title>"Climate and carbon cycle changes from 1850 to 2100 in MPI-ESM simulations for the Coupled Model Intercomparison Project phase 5"</dc:title> |
|
17 |
<dc:creator>Giorgetta Marco A.</dc:creator> |
|
18 |
<dc:creator>Jungclaus Johann</dc:creator> |
|
19 |
<dc:creator>Reick Christian H.</dc:creator> |
|
20 |
<dc:creator>Legutke Stephanie</dc:creator> |
|
21 |
<dc:creator>Bader Jürgen</dc:creator> |
|
22 |
<dc:creator>Böttinger Michael</dc:creator> |
|
23 |
<dc:creator>Brovkin Victor</dc:creator> |
|
24 |
<dc:creator>Crueger Traute</dc:creator> |
|
25 |
<dc:creator>Esch Monika</dc:creator> |
|
26 |
<dc:creator>Fieg Kerstin</dc:creator> |
|
27 |
<dc:creator>Glushak Ksenia</dc:creator> |
|
28 |
<dc:creator>Gayler Veronika</dc:creator> |
|
29 |
<dc:creator>Haak Helmuth</dc:creator> |
|
30 |
<dc:creator>Hollweg Heinz-Dieter</dc:creator> |
|
31 |
<dc:creator>Ilyina Tatiana</dc:creator> |
|
32 |
<dc:creator>Kinne Stefan</dc:creator> |
|
33 |
<dc:creator>Kornblueh Luis</dc:creator> |
|
34 |
<dc:creator>Matei Daniela</dc:creator> |
|
35 |
<dc:creator>Mauritsen Thorsten</dc:creator> |
|
36 |
<dc:creator>Mikolajewicz Uwe</dc:creator> |
|
37 |
<dc:creator>Mueller Wolfgang</dc:creator> |
|
38 |
<dc:creator>Notz Dirk</dc:creator> |
|
39 |
<dc:creator>Pithan Felix</dc:creator> |
|
40 |
<dc:creator>Raddatz Thomas</dc:creator> |
|
41 |
<dc:creator>Rast Sebastian</dc:creator> |
|
42 |
<dc:creator>Redler Rene</dc:creator> |
|
43 |
<dc:creator>Roeckner Erich</dc:creator> |
|
44 |
<dc:creator>Schmidt Hauke</dc:creator> |
|
45 |
<dc:creator>Schnur Reiner</dc:creator> |
|
46 |
<dc:creator>Segschneider Joachim</dc:creator> |
|
47 |
<dc:creator>Six Katharina D.</dc:creator> |
|
48 |
<dc:creator>Stockhause Martina</dc:creator> |
|
49 |
<dc:creator>Timmreck Claudia</dc:creator> |
|
50 |
<dc:creator>Wegner Jörg</dc:creator> |
|
51 |
<dc:creator>Widmann Heinrich</dc:creator> |
|
52 |
<dc:creator>Wieners Karl-H.</dc:creator> |
|
53 |
<dc:creator>Claussen Martin</dc:creator> |
|
54 |
<dc:creator>Marotzke Jochem</dc:creator> |
|
55 |
<dc:creator>Stevens Bjorn</dc:creator> |
|
56 |
<dc:date/> |
|
23 | 57 |
<dc:description/> |
24 | 58 |
<dc:type>publication</dc:type> |
25 |
|
|
26 |
<oaf:relatedIdentifier entityType="dataset" inverseRelationType="isRelatedTo" |
|
27 |
relatedIdentifierType="dnet" |
|
28 |
relationType="isRelatedTo">dli_resolver::d90bf4e9a54d2dffc53b5ec6ce7c2dd6</oaf:relatedIdentifier> |
|
29 |
<oaf:relatedIdentifier entityType="dataset" inverseRelationType="isRelatedTo" |
|
30 |
relatedIdentifierType="dnet" |
|
31 |
relationType="isRelatedTo">dli_resolver::17a9492cc1632b6b95c6e7067d145162</oaf:relatedIdentifier> |
|
32 |
<oaf:relatedIdentifier entityType="dataset" inverseRelationType="isRelatedTo" |
|
33 |
relatedIdentifierType="dnet" |
|
34 |
relationType="isRelatedTo">dli_resolver::12e836f393a607172c54013f7a4d9816</oaf:relatedIdentifier> |
|
35 |
<oaf:relatedIdentifier entityType="dataset" inverseRelationType="isRelatedTo" |
|
36 |
relatedIdentifierType="dnet" |
|
37 |
relationType="isRelatedTo">dli_resolver::c7edca93b5d2524986f2dc1b2c55f600</oaf:relatedIdentifier> |
|
59 |
<dc:publisher>Wiley-Blackwell</dc:publisher> |
|
38 | 60 |
</metadata> |
39 |
<oaf:about> |
|
61 |
<oaf:about xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
40 | 62 |
<oaf:datainfo> |
41 | 63 |
<oaf:completionStatus>complete</oaf:completionStatus> |
42 | 64 |
|
43 |
<oaf:collectedFrom completionStatus="complete" id="pubmed______" name="PubMed"/>
|
|
65 |
<oaf:resolvedFrom id="dli_________::crossref" name="Crossref" completionStatus="complete"/>
|
|
44 | 66 |
|
45 | 67 |
</oaf:datainfo> |
46 | 68 |
</oaf:about> |
47 |
|
|
48 |
</record> |
|
69 |
</oai:record> |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/cc/MindistSearchMapper.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.IOException; |
4 | 4 |
|
5 |
import org.apache.commons.logging.Log; |
|
6 |
import org.apache.commons.logging.LogFactory; |
|
5 | 7 |
import org.apache.hadoop.io.Text; |
6 | 8 |
import org.apache.hadoop.mapreduce.Mapper; |
7 | 9 |
|
... | ... | |
10 | 12 |
*/ |
11 | 13 |
public class MindistSearchMapper extends Mapper<Text, VertexWritable, Text, VertexWritable> { |
12 | 14 |
|
15 |
private static final Log log = LogFactory.getLog(MindistSearchMapper.class); |
|
16 |
|
|
17 |
private boolean debug = false; |
|
18 |
|
|
13 | 19 |
@Override |
20 |
protected void setup(Mapper.Context context) throws IOException, InterruptedException { |
|
21 |
super.setup(context); |
|
22 |
|
|
23 |
debug = context.getConfiguration().getBoolean("mindist_DEBUG", false); |
|
24 |
log.info("debug mode: " + debug); |
|
25 |
} |
|
26 |
|
|
27 |
@Override |
|
14 | 28 |
protected void map(Text key, VertexWritable value, Context context) throws IOException, InterruptedException { |
15 | 29 |
|
16 |
context.write(key, value);
|
|
30 |
emit(key, value, context);
|
|
17 | 31 |
if (value.isActivated()) { |
18 |
VertexWritable writable = new VertexWritable();
|
|
19 |
for (Text neighborVertex : value.getEdges()) {
|
|
20 |
if (!neighborVertex.toString().equals(value.getVertexId().toString())) {
|
|
21 |
writable.setVertexId(value.getVertexId());
|
|
22 |
writable.setEdges(null);
|
|
23 |
context.write(neighborVertex, writable);
|
|
32 |
VertexWritable vertex = new VertexWritable();
|
|
33 |
for (Text edge : value.getEdges()) {
|
|
34 |
if (!edge.toString().equals(value.getVertexId().toString())) {
|
|
35 |
vertex.setVertexId(value.getVertexId());
|
|
36 |
vertex.setEdges(null);
|
|
37 |
emit(edge, vertex, context);
|
|
24 | 38 |
} |
25 | 39 |
} |
26 | 40 |
} |
27 | 41 |
} |
28 | 42 |
|
43 |
private void emit(final Text key, final VertexWritable vertex, final Context context) throws IOException, InterruptedException { |
|
44 |
context.write(key, vertex); |
|
45 |
if (debug) { |
|
46 |
log.info(vertex.toJSON()); |
|
47 |
} |
|
48 |
} |
|
49 |
|
|
29 | 50 |
} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/cc/MindistSearchReducer.java | ||
---|---|---|
15 | 15 |
|
16 | 16 |
private static final Log log = LogFactory.getLog(MindistSearchReducer.class); |
17 | 17 |
|
18 |
public static final String UPDATE_COUNTER = "UpdateCounter"; |
|
19 |
public static final String SKIPPED = "SKIPPED"; |
|
20 |
public static final String UPDATED = "UPDATED"; |
|
21 |
|
|
18 | 22 |
private boolean depthOne; |
19 | 23 |
|
20 | 24 |
private boolean debug = false; |
... | ... | |
28 | 32 |
depthOne = true; |
29 | 33 |
} |
30 | 34 |
|
31 |
try { |
|
32 |
debug = Boolean.valueOf(context.getConfiguration().get("mindist_DEBUG")); |
|
33 |
} catch(Throwable e) { |
|
34 |
debug = false; |
|
35 |
} |
|
35 |
debug = context.getConfiguration().getBoolean("mindist_DEBUG", false); |
|
36 | 36 |
log.info("debug mode: " + debug); |
37 | 37 |
} |
38 | 38 |
|
... | ... | |
41 | 41 |
|
42 | 42 |
VertexWritable realVertex = null; |
43 | 43 |
Text currentMinimalKey = null; |
44 |
boolean foundEdges = false; |
|
44 |
//boolean foundEdges = false;
|
|
45 | 45 |
|
46 | 46 |
if (depthOne) { |
47 | 47 |
for (VertexWritable vertex : values) { |
48 | 48 |
if (!vertex.isMessage()) { |
49 | 49 |
//log.info(String.format("found vertex with edges: %s", key.toString())); |
50 | 50 |
realVertex = vertex.clone(); |
51 |
foundEdges = true; |
|
52 | 51 |
} |
53 | 52 |
} |
54 | 53 |
|
55 | 54 |
if (realVertex == null) { |
56 |
throw new IllegalStateException(String.format("foundEdges: %s, invalid input, key: '%s'", foundEdges, key.toString())); |
|
55 |
context.getCounter(UPDATE_COUNTER, SKIPPED).increment(1); |
|
56 |
return; |
|
57 | 57 |
} |
58 | 58 |
|
59 | 59 |
realVertex.setActivated(true); |
... | ... | |
63 | 63 |
realVertex.setVertexId(key); |
64 | 64 |
} |
65 | 65 |
|
66 |
context.getCounter("UpdateCounter", "UPDATED").increment(1);
|
|
66 |
context.getCounter(UPDATE_COUNTER, UPDATED).increment(1);
|
|
67 | 67 |
} else { |
68 | 68 |
for (VertexWritable vertex : values) { |
69 | 69 |
if (!vertex.isMessage()) { |
... | ... | |
82 | 82 |
} |
83 | 83 |
} |
84 | 84 |
|
85 |
if (realVertex == null) { |
|
86 |
context.getCounter(UPDATE_COUNTER, SKIPPED).increment(1); |
|
87 |
return; |
|
88 |
} |
|
89 |
|
|
85 | 90 |
if (currentMinimalKey != null && currentMinimalKey.compareTo(realVertex.getVertexId()) < 0) { |
86 | 91 |
realVertex.setVertexId(currentMinimalKey); |
87 | 92 |
realVertex.setActivated(true); |
88 |
context.getCounter("UpdateCounter", "UPDATED").increment(1);
|
|
93 |
context.getCounter(UPDATE_COUNTER, UPDATED).increment(1);
|
|
89 | 94 |
} else { |
90 | 95 |
realVertex.setActivated(false); |
91 | 96 |
} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/DedupBuildRootsReducer.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.IOException; |
4 | 4 |
import java.util.List; |
5 |
import java.util.stream.Collectors; |
|
6 |
import java.util.stream.StreamSupport; |
|
5 | 7 |
|
6 | 8 |
import com.google.common.collect.Iterables; |
7 | 9 |
import com.google.common.collect.Lists; |
... | ... | |
29 | 31 |
private DedupConfig dedupConf; |
30 | 32 |
private Ontologies ontologies; |
31 | 33 |
|
32 |
@Override
|
|
33 |
protected void setup(final Context context) throws IOException, InterruptedException {
|
|
34 |
super.setup(context);
|
|
35 |
dedupConf = DedupConfig.load(context.getConfiguration().get(JobParams.DEDUP_CONF));
|
|
36 |
System.out.println("dedup buildRoots reducer\n\nwf conf: " + dedupConf.toString());
|
|
34 |
@Override
|
|
35 |
protected void setup(final Context context) throws IOException, InterruptedException {
|
|
36 |
super.setup(context);
|
|
37 |
dedupConf = DedupConfig.load(context.getConfiguration().get(JobParams.DEDUP_CONF));
|
|
38 |
System.out.println("dedup buildRoots reducer\n\nwf conf: " + dedupConf.toString());
|
|
37 | 39 |
|
38 |
ontologies = OntologyLoader.loadOntologies(context.getConfiguration().get(JobParams.ONTOLOGIES));
|
|
39 |
System.out.println("ontologies: " + ontologies.toJson(true));
|
|
40 |
ontologies = OntologyLoader.loadOntologies(context.getConfiguration().get(JobParams.ONTOLOGIES));
|
|
41 |
System.out.println("ontologies: " + ontologies.toJson(true));
|
|
40 | 42 |
|
41 |
}
|
|
43 |
}
|
|
42 | 44 |
|
43 |
@Override |
|
44 |
protected void reduce(final Text key, final Iterable<ImmutableBytesWritable> values, final Context context) throws IOException, InterruptedException { |
|
45 |
@Override |
|
46 |
protected void reduce(final Text key, final Iterable<ImmutableBytesWritable> values, final Context context) throws IOException, InterruptedException { |
|
47 |
// ensures we're dealing with a root, otherwise returns |
|
48 |
if (!isRoot(key.toString())) { |
|
49 |
System.err.println("aborting DedupBuildRootsReducer, found non-root key: " + key); |
|
50 |
context.getCounter("DedupBuildRootsReducer", "aborted").increment(1); |
|
51 |
return; |
|
52 |
} |
|
45 | 53 |
|
46 |
// ensures we're dealing with a root, otherwise returns |
|
47 |
if (!isRoot(key.toString())) { |
|
48 |
System.err.println("aborting DedupBuildRootsReducer, found non-root key: " + key); |
|
49 |
context.getCounter("DedupBuildRootsReducer", "aborted").increment(1); |
|
50 |
return; |
|
51 |
} |
|
52 | 54 |
|
53 |
final byte[] rowkey = Bytes.toBytes(key.toString());
|
|
54 |
final List<DNGF> entities = Lists.newArrayList();
|
|
55 |
final byte[] rowkey = Bytes.toBytes(key.toString());
|
|
56 |
final List<DNGF> entities = Lists.newArrayList();
|
|
55 | 57 |
|
56 |
for (final DNGF oaf : toDNGF(values)) {
|
|
57 |
switch (oaf.getKind()) {
|
|
58 |
case entity:
|
|
59 |
entities.add(oaf);
|
|
60 |
break;
|
|
61 |
case relation:
|
|
62 |
handleRels(context, rowkey, oaf, true);
|
|
63 |
break;
|
|
64 |
default:
|
|
65 |
break;
|
|
66 |
}
|
|
67 |
}
|
|
68 |
// build and emit the root body
|
|
69 |
final DNGF.Builder builder = DNGFEntityMerger.merge(dedupConf, key.toString(), entities);
|
|
70 |
if (entities.size() < JobParams.MAX_COUNTERS) {
|
|
71 |
context.getCounter(dedupConf.getWf().getEntityType() + " root group size", lpad(entities.size())).increment(1);
|
|
72 |
} else {
|
|
73 |
context.getCounter(dedupConf.getWf().getEntityType() + " root group size", "> " + JobParams.MAX_COUNTERS).increment(1);
|
|
74 |
}
|
|
58 |
for (final DNGF oaf : toDNGF(values)) {
|
|
59 |
switch (oaf.getKind()) {
|
|
60 |
case entity:
|
|
61 |
entities.add(oaf);
|
|
62 |
break;
|
|
63 |
case relation:
|
|
64 |
handleRels(context, rowkey, oaf, true);
|
|
65 |
break;
|
|
66 |
default:
|
|
67 |
break;
|
|
68 |
}
|
|
69 |
}
|
|
70 |
// build and emit the root body
|
|
71 |
final DNGF.Builder builder = DNGFEntityMerger.merge(dedupConf, key.toString(), entities);
|
|
72 |
if (entities.size() < JobParams.MAX_COUNTERS) {
|
|
73 |
context.getCounter(dedupConf.getWf().getEntityType() + " root group size", lpad(entities.size())).increment(1);
|
|
74 |
} else {
|
|
75 |
context.getCounter(dedupConf.getWf().getEntityType() + " root group size", "> " + JobParams.MAX_COUNTERS).increment(1);
|
|
76 |
}
|
|
75 | 77 |
|
76 |
final DNGF dngf = builder.build(); |
|
77 |
final DNGFEntity entity = dngf.getEntity(); |
|
78 | 78 |
|
79 |
try { |
|
79 |
final DNGF dngf = builder.build(); |
|
80 |
final DNGFEntity entity = dngf.getEntity(); |
|
80 | 81 |
|
81 |
context.write(new ImmutableBytesWritable(Bytes.toBytes(entity.getId())), asPut(dngf)); |
|
82 |
} catch (Throwable e) { |
|
83 |
System.out.println("Exception dngf = " + dngf.toString()); |
|
84 |
context.getCounter(entity.getType().toString(), e.getClass().getName()).increment(1); |
|
85 |
} |
|
82 |
try { |
|
86 | 83 |
|
87 |
context.getCounter(entity.getType().toString(), "root").increment(1); |
|
84 |
context.write(new ImmutableBytesWritable(Bytes.toBytes(entity.getId())), asPut(dngf)); |
|
85 |
} catch (Throwable e) { |
|
86 |
System.out.println("Exception dngf = " + dngf.toString()); |
|
87 |
context.getCounter(entity.getType().toString(), e.getClass().getName()).increment(1); |
|
88 |
} |
|
88 | 89 |
|
89 |
// add person rels TODO: remove this hack |
|
90 |
// context.getCounter("hack", "personResult out").increment(personMap.size()); |
|
90 |
context.getCounter(entity.getType().toString(), "root").increment(1); |
|
91 | 91 |
|
92 |
} |
|
92 |
// add person rels TODO: remove this hack |
|
93 |
// context.getCounter("hack", "personResult out").increment(personMap.size()); |
|
93 | 94 |
|
94 |
private Iterable<DNGF> toDNGF(final Iterable<ImmutableBytesWritable> values) { |
|
95 |
return Iterables.transform(values, ibw -> DNGFDecoder.decode(ibw.copyBytes()).getDNGF()); |
|
96 |
} |
|
95 |
} |
|
97 | 96 |
|
98 |
private void handleRels(final Context context, final byte[] rowkey, final DNGF rel, final boolean hack) throws IOException, InterruptedException { |
|
97 |
private Iterable<DNGF> toDNGF(final Iterable<ImmutableBytesWritable> values) { |
|
98 |
return StreamSupport.stream(values.spliterator(), false).map(ibw -> DNGFDecoder.decode(ibw.copyBytes()).getDNGF()).collect(Collectors.toList()); |
|
99 |
} |
|
99 | 100 |
|
100 |
if (hack && checkHack(new String(rowkey), rel)) {
|
|
101 |
context.getCounter("hack", "personResult in").increment(1);
|
|
102 |
} else {
|
|
101 |
private void handleRels(final Context context, final byte[] rowkey, final DNGF rel, final boolean hack) throws IOException, InterruptedException {
|
|
102 |
// emit relation from the root to the related entities
|
|
103 |
DNGFDecoder decoder = rootToEntity(rowkey, rel, context);
|
|
103 | 104 |
|
104 |
// emit relation from the root to the related entities |
|
105 |
DNGFDecoder decoder = rootToEntity(rowkey, rel, context); |
|
106 |
context.write(new ImmutableBytesWritable(rowkey), asPutByCollectedFrom(decoder.getDNGF())); |
|
107 | 105 |
|
108 |
// emit relation from the related entities to the root |
|
109 |
decoder = entityToRoot(rowkey, rel, context); |
|
110 |
final byte[] revKey = Bytes.toBytes(decoder.relSourceId()); |
|
111 |
context.write(new ImmutableBytesWritable(revKey), asPutByCollectedFrom(decoder.getDNGF())); |
|
112 | 106 |
|
113 |
//context.getCounter(FIXED_RELATION, decoder.getRelDescriptor().shortQualifier() + " [entity <-> root]").increment(2); |
|
107 |
if ("50|dedup_wf_001::cddddb031b9f6c85046067c0cc9ad147".equals(decoder.getDNGFRel().getSource())) |
|
108 |
{ |
|
109 |
System.out.println(String.format("Writing relation %s %s %s ", decoder.getDNGFRel().getSource(),decoder.getDNGFRel().getRelType().getClassname(),decoder.getDNGFRel().getTarget())); |
|
114 | 110 |
} |
111 |
context.write(new ImmutableBytesWritable(rowkey), asPutByCollectedFrom(decoder.getDNGF())); |
|
112 |
// emit relation from the related entities to the root |
|
113 |
decoder = entityToRoot(rowkey, rel, context); |
|
114 |
if ("50|dedup_wf_001::cddddb031b9f6c85046067c0cc9ad147".equals(decoder.getDNGFRel().getTarget())) |
|
115 |
{ |
|
116 |
System.out.println(String.format("Writing relation %s %s %s ", decoder.getDNGFRel().getSource(),decoder.getDNGFRel().getRelType().getClassname(),decoder.getDNGFRel().getTarget())); |
|
117 |
} |
|
118 |
byte[] revKey = Bytes.toBytes(decoder.relSourceId()); |
|
119 |
context.write(new ImmutableBytesWritable(revKey), asPutByCollectedFrom(decoder.getDNGF())); |
|
120 |
|
|
121 |
//context.getCounter(FIXED_RELATION, decoder.getRelDescriptor().shortQualifier() + " [entity <-> root]").increment(2); |
|
122 |
|
|
115 | 123 |
// mark relation from the related entities to the duplicate as deleted |
116 |
DNGFDecoder decoder = markDeleted(rel, true, context);
|
|
117 |
byte[] revKey = Bytes.toBytes(decoder.relSourceId());
|
|
118 |
context.write(new ImmutableBytesWritable(revKey), asPut(decoder.getDNGF()));
|
|
124 |
decoder = markDeleted(rel, true, context);
|
|
125 |
revKey = Bytes.toBytes(decoder.relSourceId());
|
|
126 |
context.write(new ImmutableBytesWritable(revKey), asPut(decoder.getDNGF()));
|
|
119 | 127 |
|
120 |
// mark relation from the related entities to the duplicate as deleted
|
|
121 |
decoder = markDeleted(rel, false, context);
|
|
122 |
revKey = Bytes.toBytes(decoder.relSourceId());
|
|
123 |
context.write(new ImmutableBytesWritable(revKey), asPut(decoder.getDNGF()));
|
|
128 |
// mark relation from the related entities to the duplicate as deleted
|
|
129 |
decoder = markDeleted(rel, false, context);
|
|
130 |
revKey = Bytes.toBytes(decoder.relSourceId());
|
|
131 |
context.write(new ImmutableBytesWritable(revKey), asPut(decoder.getDNGF()));
|
|
124 | 132 |
|
125 | 133 |
context.getCounter(FIXED_RELATION, decoder.getRelDescriptor().shortQualifier() + " mark deleted [dup <-> entity]").increment(2); |
126 | 134 |
} |
127 | 135 |
|
128 |
public boolean checkHack(final String root, final DNGF oaf) { |
|
136 |
private boolean md5matches(final String id1, final String id2) { |
|
137 |
return id1.replaceAll("^.*\\:\\:", "").equals(id2.replaceAll("^.*\\:\\:", "")); |
|
138 |
} |
|
129 | 139 |
|
130 |
boolean res; |
|
131 |
final String type = dedupConf.getWf().getEntityType(); |
|
140 |
private DNGFDecoder rootToEntity(final byte[] rootRowkey, final DNGF rel, final Context context) { |
|
141 |
return patchRelations(rootRowkey, rel, DNGFPatch.rootToEntity, context); |
|
142 |
} |
|
132 | 143 |
|
133 |
if ((type.equals(Type.publication.toString()) || type.equals(Type.dataset.toString())) && |
|
134 |
oaf.getRel().getTargetType().equals(Type.person) && !md5matches(root, oaf.getRel().getSource())) { |
|
144 |
private DNGFDecoder entityToRoot(final byte[] rootRowkey, final DNGF rel, final Context context) { |
|
145 |
return patchRelations(rootRowkey, rel, DNGFPatch.entityToRoot, context); |
|
146 |
} |
|
135 | 147 |
|
136 |
res = true; |
|
137 |
} else { |
|
138 |
res = false; |
|
139 |
} |
|
148 |
private DNGFDecoder markDeleted(final DNGF rel, final boolean reverse, final Context context) { |
|
149 |
return deleteRelations(rel, reverse, context); |
|
150 |
} |
|
140 | 151 |
|
141 |
// if (root.equals("50|dedup_wf_001::92f6197ea6f16ae554755aced832fb6f")) { |
|
142 |
// System.out.println("##################"); |
|
143 |
// System.out.println("root : " + root); |
|
144 |
// System.out.println("source: " + oaf.getRel().getSource()); |
|
145 |
// System.out.println("ckeck: " + res); |
|
146 |
// } |
|
152 |
// patches relation objects setting the source field with the root id |
|
153 |
private DNGFDecoder patchRelations(final byte[] rootRowkey, final DNGF rel, final DNGFPatch patchKind, final Context context) { |
|
154 |
final String id = new String(rootRowkey); |
|
147 | 155 |
|
148 |
return res; |
|
149 |
} |
|
156 |
if ("50|dedup_wf_001::cddddb031b9f6c85046067c0cc9ad147".equals(id)) { |
|
157 |
System.out.println("#########################\n\n"); |
|
158 |
} |
|
150 | 159 |
|
151 |
private boolean md5matches(final String id1, final String id2) { |
|
152 |
return id1.replaceAll("^.*\\:\\:", "").equals(id2.replaceAll("^.*\\:\\:", "")); |
|
153 |
} |
|
154 | 160 |
|
155 |
private DNGFDecoder rootToEntity(final byte[] rootRowkey, final DNGF rel, final Context context) { |
|
156 |
return patchRelations(rootRowkey, rel, DNGFPatch.rootToEntity, context); |
|
157 |
} |
|
161 |
final DNGFRelDecoder decoder = DNGFRelDecoder.decode(rel.getRel()); |
|
162 |
final DNGF.Builder builder = DNGF.newBuilder(rel); |
|
163 |
builder.getDataInfoBuilder().setInferred(true).setDeletedbyinference(false); |
|
164 |
switch (patchKind) { |
|
165 |
case rootToEntity: |
|
166 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots p:rootToEntity)"); |
|
167 |
builder.getRelBuilder().setSource(new String(rootRowkey)); |
|
168 |
break; |
|
158 | 169 |
|
159 |
private DNGFDecoder entityToRoot(final byte[] rootRowkey, final DNGF rel, final Context context) { |
|
160 |
return patchRelations(rootRowkey, rel, DNGFPatch.entityToRoot, context); |
|
161 |
} |
|
170 |
case entityToRoot: |
|
171 |
builder.setRel(decoder.setClassId(getInverse(decoder, context))); |
|
172 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots p:entityToRoot)"); |
|
173 |
builder.getRelBuilder().setSource(builder.getRel().getTarget()); |
|
174 |
builder.getRelBuilder().setTarget(new String(rootRowkey)); |
|
175 |
final Type sourceType = builder.getRelBuilder().getSourceType(); |
|
176 |
builder.getRelBuilder().setSourceType(builder.getRel().getTargetType()); |
|
177 |
builder.getRelBuilder().setTargetType(sourceType); |
|
178 |
break; |
|
162 | 179 |
|
163 |
private DNGFDecoder markDeleted(final DNGF rel, final boolean reverse, final Context context) {
|
|
164 |
return deleteRelations(rel, reverse, context);
|
|
165 |
}
|
|
180 |
default:
|
|
181 |
break;
|
|
182 |
}
|
|
166 | 183 |
|
167 |
// patches relation objects setting the source field with the root id |
|
168 |
private DNGFDecoder patchRelations(final byte[] rootRowkey, final DNGF rel, final DNGFPatch patchKind, final Context context) { |
|
169 |
final DNGFRelDecoder decoder = DNGFRelDecoder.decode(rel.getRel()); |
|
170 |
final DNGF.Builder builder = DNGF.newBuilder(rel); |
|
171 |
builder.getDataInfoBuilder().setInferred(true).setDeletedbyinference(false); |
|
172 |
switch (patchKind) { |
|
173 |
case rootToEntity: |
|
174 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots p:rootToEntity)"); |
|
175 |
builder.getRelBuilder().setSource(new String(rootRowkey)); |
|
176 |
break; |
|
184 |
return DNGFDecoder.decode(builder.build()); |
|
185 |
} |
|
177 | 186 |
|
178 |
case entityToRoot: |
|
179 |
builder.setRel(decoder.setClassId(getInverse(decoder, context))); |
|
180 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots p:entityToRoot)"); |
|
181 |
builder.getRelBuilder().setSource(builder.getRel().getTarget()); |
|
182 |
builder.getRelBuilder().setTarget(new String(rootRowkey)); |
|
183 |
final Type sourceType = builder.getRelBuilder().getSourceType(); |
|
184 |
builder.getRelBuilder().setSourceType(builder.getRel().getTargetType()); |
|
185 |
builder.getRelBuilder().setTargetType(sourceType); |
|
186 |
break; |
|
187 |
|
|
188 |
default: |
|
189 |
break; |
|
190 |
} |
|
191 |
|
|
192 |
return DNGFDecoder.decode(builder.build()); |
|
193 |
} |
|
194 |
|
|
195 |
private String getInverse(final DNGFRelDecoder decoder, final Context context) { |
|
196 |
final String inverse = ontologies.inverseOf(decoder.getRelDescriptor()); |
|
197 |
if (StringUtils.isBlank(inverse)) { |
|
187 |
private String getInverse(final DNGFRelDecoder decoder, final Context context) { |
|
188 |
final String inverse = ontologies.inverseOf(decoder.getRelDescriptor()); |
|
189 |
if (StringUtils.isBlank(inverse)) { |
|
198 | 190 |
//context.getCounter("unmapped relationship", decoder.getRelDescriptor().shortQualifier()).increment(1); |
199 | 191 |
return "unknown"; |
200 |
}
|
|
201 |
return inverse;
|
|
202 |
}
|
|
192 |
}
|
|
193 |
return inverse;
|
|
194 |
}
|
|
203 | 195 |
|
204 |
private DNGFDecoder deleteRelations(final DNGF rel, final boolean reverse, final Context context) {
|
|
205 |
final DNGF.Builder builder = DNGF.newBuilder(rel);
|
|
206 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots d: " + reverse + ")");
|
|
207 |
builder.getDataInfoBuilder().setDeletedbyinference(true);
|
|
196 |
private DNGFDecoder deleteRelations(final DNGF rel, final boolean reverse, final Context context) {
|
|
197 |
final DNGF.Builder builder = DNGF.newBuilder(rel);
|
|
198 |
// builder.getDataInfoBuilder().setInferenceprovenance("dedup (BuildRoots d: " + reverse + ")");
|
|
199 |
builder.getDataInfoBuilder().setDeletedbyinference(true);
|
|
208 | 200 |
|
209 |
if (reverse) {
|
|
210 |
final DNGFRelDecoder decoder = DNGFRelDecoder.decode(rel.getRel());
|
|
201 |
if (reverse) {
|
|
202 |
final DNGFRelDecoder decoder = DNGFRelDecoder.decode(rel.getRel());
|
|
211 | 203 |
|
212 |
builder.setRel(decoder.setClassId(getInverse(decoder, context)));
|
|
213 |
// swap source and target
|
|
214 |
final String tmp = builder.getRel().getSource();
|
|
204 |
builder.setRel(decoder.setClassId(getInverse(decoder, context)));
|
|
205 |
// swap source and target
|
|
206 |
final String tmp = builder.getRel().getSource();
|
|
215 | 207 |
final Type sType = builder.getRel().getSourceType(); |
216 | 208 |
builder.getRelBuilder().setSource(builder.getRel().getTarget()); |
217 |
builder.getRelBuilder().setTarget(tmp);
|
|
209 |
builder.getRelBuilder().setTarget(tmp);
|
|
218 | 210 |
builder.getRelBuilder().setSourceType(builder.getRel().getTargetType()); |
219 | 211 |
builder.getRelBuilder().setTargetType(sType); |
220 | 212 |
} |
221 | 213 |
|
222 |
return DNGFDecoder.decode(builder.build()); |
Also available in: Unified diff
implemented incremental transform and resolving