Project

General

Profile

« Previous | Next » 

Revision 29079

#354 removing obsolete transformers/export/document transformer along with tests

View differences:

modules/icm-iis-transformers/trunk/src/test/java/eu/dnetlib/iis/transformers/export/document/WorkflowTest.java
1
package eu.dnetlib.iis.transformers.export.document;
2

  
3
import eu.dnetlib.iis.IntegrationTest;
4
import eu.dnetlib.iis.core.AbstractWorkflowTestCase;
5
import eu.dnetlib.iis.core.WorkflowConfiguration;
6
import org.junit.Test;
7
import org.junit.experimental.categories.Category;
8

  
9
/**
10
 * Integration test (see the {@code IntegrationTest} category below) that runs the Oozie workflow stored under the sampledataproducer/oozie_app test resources of the export/document transformer.
11
 * @author Dominika Tkaczyk
12
 *
13
 */
14
@Category(IntegrationTest.class)
15
public class WorkflowTest extends AbstractWorkflowTestCase {
16

  
17
    @Test
18
	public void testWorkflow() throws Exception {
19
        WorkflowConfiguration wf = new WorkflowConfiguration();
20
        wf.setTimeoutInSeconds(720); // 720 seconds = 12 minutes
21
        runWorkflow("eu/dnetlib/iis/transformers/export/document/sampledataproducer/oozie_app", wf);
22
    }
23

  
24
}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/oozie_app/import.txt
1
## This is a classpath-based import file (this header is required)
2
transformer_export_document classpath eu/dnetlib/iis/transformers/export/document/oozie_app
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/oozie_app/workflow.xml
1
<workflow-app xmlns="uri:oozie:workflow:0.2" name="test-transformers_export_document_sampledataproducer">
2
    <start to="producer"/>
3
    <action name="producer">
4
        <java>
5
            <job-tracker>${jobTracker}</job-tracker>
6
            <name-node>${nameNode}</name-node>
7
			<!-- The data generated by this node is deleted in this section -->
8
			<prepare>
9
				<delete path="${nameNode}${workingDir}/producer" />
10
				<mkdir path="${nameNode}${workingDir}/producer" />
11
			</prepare>
12
            <configuration>
13
                <property>
14
                    <name>mapred.job.queue.name</name>
15
                    <value>${queueName}</value>
16
                </property>
17
            </configuration>
18
            <!-- This is a simple wrapper for the Java code -->
19
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
20
			<!-- The business Java code that gets to be executed -->
21
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
22
			<!-- Specification of the output ports -->
23
            <arg>-C{extracted_document_metadata,
24
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
25
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/extracted_document_metadata.json}</arg>
26
            <arg>-C{citation,
27
				eu.dnetlib.iis.citationmatching.schemas.Citation,
28
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/citations.json}</arg>
29
            <arg>-C{document_to_project,
30
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
31
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_project.json}</arg>
32
            <arg>-C{document_to_dataset,
33
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
34
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_dataset.json}</arg>
35
            <arg>-C{document_to_research_initiative,
36
				eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToResearchInitiative,
37
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_research_initiative.json}</arg>
38
            <arg>-C{document_to_document_clusters,
39
				eu.dnetlib.iis.documentsclustering.schemas.DocumentToDocumentClusters,
40
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_clusters.json}</arg>    
41
            <arg>-C{document_to_document_classes,
42
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
43
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_classes.json}</arg>    
44
            <arg>-C{document_to_document_statistics,
45
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
46
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_statistics.json}</arg>
47
            <arg>-C{document_with_website_usage_similarities,
48
				eu.dnetlib.iis.websiteusage.schemas.DocumentsWithWebsiteUsageSimilarities,
49
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_with_website_usage_similarities.json}</arg>
50
                           
51
			<!-- All input and output ports have to be bound to paths in HDFS, working 
52
				directory has to be specified as well -->
53
            <arg>-SworkingDir=${workingDir}/producer/working_dir</arg>
54
            <arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg>
55
            <arg>-Ocitation=${workingDir}/producer/citation</arg>
56
            <arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg>
57
            <arg>-Odocument_to_dataset=${workingDir}/producer/document_to_dataset</arg>
58
            <arg>-Odocument_to_research_initiative=${workingDir}/producer/document_to_research_initiative</arg>
59
            <arg>-Odocument_to_document_clusters=${workingDir}/producer/document_to_document_clusters</arg>
60
            <arg>-Odocument_to_document_classes=${workingDir}/producer/document_to_document_classes</arg>
61
            <arg>-Odocument_to_document_statistics=${workingDir}/producer/document_to_document_statistics</arg>
62
            <arg>-Odocument_with_website_usage_similarities=${workingDir}/producer/document_with_website_usage_similarities</arg>    
63
        </java>
64
        <ok to="transformer_export_document"/>
65
        <error to="fail"/>
66
    </action>
67
    <action name="transformer_export_document">
68
        <sub-workflow>
69
            <app-path>${wf:appPath()}/transformer_export_document</app-path>
70
            <configuration>
71
                <property>
72
                    <name>jobTracker</name>
73
                    <value>${jobTracker}</value>
74
                </property>
75
                <property>
76
                    <name>nameNode</name>
77
                    <value>${nameNode}</value>
78
                </property>
79
                <property>
80
                    <name>queueName</name>
81
                    <value>${queueName}</value>
82
                </property>
83
                <!-- Working directory of the subworkflow -->
84
                <property>
85
                    <name>workingDir</name>
86
                    <value>${workingDir}/transformer_export_document/working_dir</value>
87
                </property>
88
                <!-- Input ports. -->
89
                <property>
90
                    <name>input_extracted_document_metadata</name>
91
                    <value>${workingDir}/producer/extracted_document_metadata</value>
92
                </property>
93
                <property>
94
                    <name>input_citation</name>
95
                    <value>${workingDir}/producer/citation</value>
96
                </property>
97
                <property>
98
                    <name>input_document_to_project</name>
99
                    <value>${workingDir}/producer/document_to_project</value>
100
                </property>
101
                <property>
102
                    <name>input_document_to_dataset</name>
103
                    <value>${workingDir}/producer/document_to_dataset</value>
104
                </property>
105
                <property>
106
                    <name>input_document_to_research_initiative</name>
107
                    <value>${workingDir}/producer/document_to_research_initiative</value>
108
                </property>
109
                <property>
110
                    <name>input_document_to_document_clusters</name>
111
                    <value>${workingDir}/producer/document_to_document_clusters</value>
112
                </property>
113
                <property>
114
                    <name>input_document_to_document_classes</name>
115
                    <value>${workingDir}/producer/document_to_document_classes</value>
116
                </property>
117
                <property>
118
                    <name>input_document_to_document_statistics</name>
119
                    <value>${workingDir}/producer/document_to_document_statistics</value>
120
                </property>
121
                <property>
122
                    <name>input_document_with_website_usage_similarities</name>
123
                    <value>${workingDir}/producer/document_with_website_usage_similarities</value>
124
                </property>
125
                <!-- Output port bound to given path -->
126
                <property>
127
                    <name>output_document_with_inferenced_data</name>
128
                    <value>${workingDir}/transformer_export_document/document_with_inferenced_data</value>
129
                </property>
130
            </configuration>
131
        </sub-workflow>
132
        <ok to="consumer"/>
133
        <error to="fail"/>
134
    </action>
135
    <action name="consumer">
136
		<java>
137
			<job-tracker>${jobTracker}</job-tracker>
138
			<name-node>${nameNode}</name-node>
139
			<!-- The data generated by this node is deleted in this section -->
140
			<prepare>
141
				<delete path="${nameNode}${workingDir}/consumer" />
142
				<mkdir path="${nameNode}${workingDir}/consumer" />
143
			</prepare>
144
			<configuration>
145
				<property>
146
					<name>mapred.job.queue.name</name>
147
					<value>${queueName}</value>
148
				</property>
149
			</configuration>
150
			<!-- This is a simple wrapper for the Java code -->
151
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
152
			<!-- The business Java code that gets to be executed -->
153
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg>
154
			<!-- Specification of the input ports -->
155
			<arg>-C{document_with_inferenced_data,
156
				eu.dnetlib.iis.export.schemas.DocumentWithInferencedData,
157
				eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_with_inferenced_data.json}</arg>
158
			<!-- All input and output ports have to be bound to paths in HDFS, working 
159
				directory has to be specified as well -->
160
			<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg>
161
			<arg>-Idocument_with_inferenced_data=${workingDir}/transformer_export_document/document_with_inferenced_data</arg>
162
		</java>
163
		<ok to="end" />
164
		<error to="fail" />
165
	</action>    
166
    <kill name="fail">
167
		<message>Unfortunately, the workflow failed -- error message:
168
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
169
    </kill>
170
    <end name="end"/>
171
</workflow-app>
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_classes.json
1
{"documentId": "id-10", "classes": {"arXivClasses": null, "WoSClasses": [{"classLabels": ["TOXICOLOGY"], "confidenceLevel": 0.112}, {"classLabels": ["ONCOLOGY"], "confidenceLevel": 0.098}], "DDCClasses":[{"classLabels": ["Technology", "Agriculture"], "confidenceLevel": 0.403}, {"classLabels": ["Science", "Biology"], "confidenceLevel": 0.286}], "meshEuroPMCClasses": null}}
2
{"documentId": "id-2", "classes": {"arXivClasses": [{"classLabels": ["Quantitative Biology", "Cell Behavior"], "confidenceLevel": 0.102}, {"classLabels": ["Quantitative Biology", "Tissues and Organs"], "confidenceLevel": 0.101}], "WoSClasses": null, "DDCClasses":[{"classLabels": ["Science", "Science"], "confidenceLevel": 0.285}, {"classLabels": ["Technology", "Medicine & health"], "confidenceLevel": 0.198}], "meshEuroPMCClasses": null}}
3
{"documentId": "id-3", "classes": {"arXivClasses": null, "WoSClasses": [{"classLabels": ["SUBSTANCE ABUSE"], "confidenceLevel": 0.096}, {"classLabels": ["COMPUTER SCIENCE, HARDWARE & ARCHITECTURE"], "confidenceLevel": 0.089}], "DDCClasses": null, "meshEuroPMCClasses": null}}
4
{"documentId": "id-9", "classes": null}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_with_website_usage_similarities.json
1
{"documentId": "id-2", "otherDocumentId": "id-5", "covisitedSimilarity": 0.45}
2
{"documentId": "id-9", "otherDocumentId": "id-5", "covisitedSimilarity": 0.58}
3
{"documentId": "id-2", "otherDocumentId": "id-9", "covisitedSimilarity": 0.97}
4
{"documentId": "id-3", "otherDocumentId": "id-1", "covisitedSimilarity": null}
5
{"documentId": "id-4", "otherDocumentId": "id-10", "covisitedSimilarity": null}
6
{"documentId": "id-5", "otherDocumentId": "id-9", "covisitedSimilarity": 0.58}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/citations.json
1
{"sourceDocumentId": "id-1", "rawText": "abc", "destinationDocumentId": "id-2", "confidenceLevel": null}
2
{"sourceDocumentId": "id-1", "rawText": "def", "destinationDocumentId": "id-4", "confidenceLevel": null}
3
{"sourceDocumentId": "id-3", "rawText": "abc", "destinationDocumentId": "id-4", "confidenceLevel": null}
4
{"sourceDocumentId": "id-4", "rawText": "xxx", "destinationDocumentId": "id-1", "confidenceLevel": null}
5
{"sourceDocumentId": "id-4", "rawText": "xyz", "destinationDocumentId": "id-2", "confidenceLevel": null}
6
{"sourceDocumentId": "id-7", "rawText": "xyz", "destinationDocumentId": "id-4", "confidenceLevel": null}
7
{"sourceDocumentId": "id-7", "rawText": "qwerty", "destinationDocumentId": "id-1", "confidenceLevel": null}
8
{"sourceDocumentId": "id-7", "rawText": "bnmm", "destinationDocumentId": "id-2", "confidenceLevel": null}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_research_initiative.json
1
{"documentId":"id-1","egiConceptId":"egi-8095","confidenceLevel":1.5766148184367308}
2
{"documentId":"id-8","egiConceptId":"egi-0820","confidenceLevel":0.8652002245558127}
3
{"documentId":"id-1","egiConceptId":"egi-5103","confidenceLevel":1.3005493344846906}
4
{"documentId":"id-4","egiConceptId":"egi-0763","confidenceLevel":1.6733200530681511}
5
{"documentId":"id-4","egiConceptId":"egi-3463","confidenceLevel":1.6733200530681511}
6
{"documentId":"id-9","egiConceptId":"egi-5840","confidenceLevel":1.6053482043291596}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/extracted_document_metadata.json
1
{"publisher": "Tor Science Fiction", "affiliations": null, "authors": [{"authorFullName": "author-1", "affiliationPositions": null}, {"authorFullName": "author-2", "affiliationPositions": null}, {"authorFullName": "author-3", "affiliationPositions": null}], "language": "eng", "title": "Enders Game", "externalIdentifiers": null, "journal": "Journal-2", "id": "id-1", "pages": {"start": "123", "end": "128"}, "volume": null, "references": null, "year": null, "keywords": null, "issue": null, "abstract": null}
2
{"publisher": null, "affiliations": null, "authors": [{"authorFullName": "author-1", "affiliationPositions": null}, {"authorFullName": "author-2", "affiliationPositions": null}], "language": null, "title": null, "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "id": "id-2", "pages": null, "volume": "124", "references": [{"position": 1, "basicMetadata": {"publisher": null, "title": "The Lord of the Rings", "url": null, "series": null, "authors": null, "volume": null, "edition": null, "source": null, "year": "2012", "issue": null, "pages": {"start": "1", "end": "236"}, "location": null}, "text": "J.R.R. Tolkien, The Lord of the Rings, 2012"}, {"position": 2, "basicMetadata": {"publisher": null, "title": "The Other Wind", "url": null, "series": null, "authors": null, "volume": "vol.23", "edition": null, "source": null, "year": "2003", "issue": null, "pages": null, "location": null}, "text": "Ursula K. Le Guin, The Other Wind, 2003"}], "year": 1970, "keywords": ["kwd_1", "kwd_3", "kwd_5"], "issue": null, "abstract": "The tales"}
3
{"publisher": "Harp3r T0rch", "affiliations": null, "authors": null, "language": "en", "title": "Small Gods", "externalIdentifiers": null, "journal": "Journal", "id": "id-3", "pages": null, "volume": "32", "references": [{"position": 1, "basicMetadata": {"publisher": null, "title": "The Lord of the Rings", "url": null, "series": null, "authors": null, "volume": null, "edition": null, "source": null, "year": "2012", "issue": null, "pages": {"start": "1", "end": "236"}, "location": null}, "text": "J.R.R. Tolkien, The Lord of the Rings, 2012"}], "year": null, "keywords": null, "issue": "4", "abstract": null}
4
{"publisher": null, "affiliations": null, "authors": null, "language": null, "title": null, "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "id": "id-6", "pages": null, "volume": "vol3", "references": null, "year": 2011, "keywords": null, "issue": "6", "abstract": "A Game of Thrones"}
5
{"publisher": null, "affiliations": null, "authors": null, "language": null, "title": null, "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "id": "id-4", "pages": {"start": "3", "end": "503"}, "volume": null, "references": null, "year": 1997, "keywords": ["kwd_1", "kwd_3", "kwd_5"], "issue": "2", "abstract": "Interview with the Vampire"}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_clusters.json
1
{"documentId": "id-4", "clusters": {"clusteringMethod1ClusterId": 123, "clusteringMethod2ClusterId": 90}}
2
{"documentId": "id-10", "clusters": {"clusteringMethod1ClusterId": 123, "clusteringMethod2ClusterId": 90}}
3
{"documentId": "id-7", "clusters": {"clusteringMethod1ClusterId": 45, "clusteringMethod2ClusterId": 90}}
4
{"documentId": "id-8", "clusters": {"clusteringMethod1ClusterId": 69, "clusteringMethod2ClusterId": null}}
5
{"documentId": "id-1", "clusters": null}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_with_inferenced_data.json
1
{"id": "id-1", "title": "Enders Game", "abstract": null, "language": "eng", "keywords": null, "externalIdentifiers": null, "journal": "Journal-2", "year": null, "publisher": "Tor Science Fiction", "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": [{"id" : "id-2", "text" : "abc"},{"id" : "id-4", "text" : "def"}], "referencedDataSetIds": ["8095", "5103"], "researchInitiativeConceptIds" : ["egi-8095", "egi-5103"], "clusters": null, "classes": null, "statistics": {"citationsFromAllPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "citationsFromPublishedPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}}, "websiteUsageSimilarities": null}
2
{"id": "id-2", "title": null, "abstract": "The tales", "language": null, "keywords": ["kwd_1", "kwd_3", "kwd_5"], "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "year": 1970, "publisher": null, "text": null, "projectIds": ["248095"], "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": null, "classes": {"arXivClasses": [{"classLabels": ["Quantitative Biology", "Cell Behavior"], "confidenceLevel": 0.102}, {"classLabels": ["Quantitative Biology", "Tissues and Organs"], "confidenceLevel": 0.101}], "WoSClasses": null, "DDCClasses":[{"classLabels": ["Science", "Science"], "confidenceLevel": 0.285}, {"classLabels": ["Technology", "Medicine & health"], "confidenceLevel": 0.198}], "meshEuroPMCClasses": null}, "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}, "websiteUsageSimilarities": [{"documentId": "id-5", "covisitedSimilarity": 0.45}, {"documentId": "id-9", "covisitedSimilarity": 0.97}]}
3
{"id": "id-3", "title": "Small Gods", "abstract": null, "language": "en", "keywords": null, "externalIdentifiers": null, "journal": "Journal", "year": null, "publisher": "Harp3r T0rch", "text": null, "projectIds": ["300820", "275103"], "authorIds": null, "matchedCitationDocumentIds": [{"id" : "id-4", "text" : "abc"}], "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": null, "classes": {"arXivClasses": null, "WoSClasses": [{"classLabels": ["SUBSTANCE ABUSE"], "confidenceLevel": 0.096}, {"classLabels": ["COMPUTER SCIENCE, HARDWARE & ARCHITECTURE"], "confidenceLevel": 0.089}], "DDCClasses": null, "meshEuroPMCClasses": null}, "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}, "websiteUsageSimilarities": [{"documentId": "id-1", "covisitedSimilarity": null}]}
4
{"id": "id-4", "title": null, "abstract": "Interview with the Vampire", "language": null, "keywords": ["kwd_1", "kwd_3", "kwd_5"], "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "year": 1997, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": [{"id" : "id-1", "text" : "xxx"},{"id" : "id-2", "text" : "xyz"}], "referencedDataSetIds": ["0763"], "researchInitiativeConceptIds" : ["egi-0763", "egi-3463"], "clusters": {"clusteringMethod1ClusterId": 123, "clusteringMethod2ClusterId": 90}, "classes": null, "statistics": {"citationsFromAllPapers": {"numberOfCitations": 3, "numberOfCitationsPerYear": {"2010": 1, "2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2010": 1, "2001": 1}}}, "websiteUsageSimilarities": [{"documentId": "id-10", "covisitedSimilarity": null}]}
5
{"id": "id-5", "title": null, "abstract": null, "language": null, "keywords": null, "externalIdentifiers": null, "journal": null, "year": null, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": null, "classes": null, "statistics": null, "websiteUsageSimilarities": [{"documentId": "id-9", "covisitedSimilarity": 0.58}]}
6
{"id": "id-6", "title": null, "abstract": "A Game of Thrones", "language": null, "keywords": null, "externalIdentifiers": {"id-1": "val-extr-1", "id-3": "val-extr-3"}, "journal": null, "year": 2011, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": null, "classes": null, "statistics": null, "websiteUsageSimilarities": null}
7
{"id": "id-7", "title": null, "abstract": null, "language": null, "keywords": null, "externalIdentifiers": null, "journal": null, "year": null, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": [{"id" : "id-4", "text" : "xyz"},{"id" : "id-1", "text" : "qwerty"},{"id" : "id-2", "text" : "bnmm"}], "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": {"clusteringMethod1ClusterId": 45, "clusteringMethod2ClusterId": 90}, "classes": null, "statistics": null, "websiteUsageSimilarities": null}
8
{"id": "id-8", "title": null, "abstract": null, "language": null, "keywords": null, "externalIdentifiers": null, "journal": null, "year": null, "publisher": null, "text": null, "projectIds": ["240763", "275840"], "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": ["0820"], "researchInitiativeConceptIds" : ["egi-0820"], "clusters": {"clusteringMethod1ClusterId": 69, "clusteringMethod2ClusterId": null}, "classes": null, "statistics": null, "websiteUsageSimilarities": null}
9
{"id": "id-9", "title": null, "abstract": null, "language": null, "keywords": null, "externalIdentifiers": null, "journal": null, "year": null, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": ["5840"], "researchInitiativeConceptIds" : ["egi-5840"], "clusters": null, "classes": null, "statistics": null, "websiteUsageSimilarities": [{"documentId": "id-5", "covisitedSimilarity": 0.58}]}
10
{"id": "id-10", "title": null, "abstract": null, "language": null, "keywords": null, "externalIdentifiers": null, "journal": null, "year": null, "publisher": null, "text": null, "projectIds": null, "authorIds": null, "matchedCitationDocumentIds": null, "referencedDataSetIds": null, "researchInitiativeConceptIds" : null, "clusters": {"clusteringMethod1ClusterId": 123, "clusteringMethod2ClusterId": 90}, "classes": {"arXivClasses": null, "WoSClasses": [{"classLabels": ["TOXICOLOGY"], "confidenceLevel": 0.112}, {"classLabels": ["ONCOLOGY"], "confidenceLevel": 0.098}], "DDCClasses":[{"classLabels": ["Technology", "Agriculture"], "confidenceLevel": 0.403}, {"classLabels": ["Science", "Biology"], "confidenceLevel": 0.286}], "meshEuroPMCClasses": null}, "statistics": null, "websiteUsageSimilarities": null}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_dataset.json
1
{"documentId":"id-1","datasetId":"8095","confidenceLevel":1.5766148184367308}
2
{"documentId":"id-8","datasetId":"0820","confidenceLevel":0.8652002245558127}
3
{"documentId":"id-1","datasetId":"5103","confidenceLevel":1.3005493344846906}
4
{"documentId":"id-4","datasetId":"0763","confidenceLevel":1.6733200530681511}
5
{"documentId":"id-9","datasetId":"5840","confidenceLevel":1.6053482043291596}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_project.json
1
{"documentId":"id-2","projectId":"248095","confidenceLevel":1.5766148184367308}
2
{"documentId":"id-3","projectId":"300820","confidenceLevel":0.8652002245558127}
3
{"documentId":"id-3","projectId":"275103","confidenceLevel":1.3005493344846906}
4
{"documentId":"id-8","projectId":"240763","confidenceLevel":1.6733200530681511}
5
{"documentId":"id-8","projectId":"275840","confidenceLevel":1.6053482043291596}
modules/icm-iis-transformers/trunk/src/test/resources/eu/dnetlib/iis/transformers/export/document/sampledataproducer/data/document_to_document_statistics.json
1
{"documentId": "id-1", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "citationsFromPublishedPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}}}
2
{"documentId": "id-2", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}}
3
{"documentId": "id-3", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}}
4
{"documentId": "id-4", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 3, "numberOfCitationsPerYear": {"2010": 1, "2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2010": 1, "2001": 1}}}}
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/document/oozie_app/lib/scripts/transformer.pig
1
define avro_load_extracted_document_metadata
2
org.apache.pig.piggybank.storage.avro.AvroStorage(
3
'input_schema_class', '$schema_input_extracted_document_metadata');
4

  
5
define avro_load_citation
6
org.apache.pig.piggybank.storage.avro.AvroStorage(
7
'input_schema_class', '$schema_input_citation');
8

  
9
define avro_load_document_to_project
10
org.apache.pig.piggybank.storage.avro.AvroStorage(
11
'input_schema_class', '$schema_input_document_to_project');
12

  
13
define avro_load_document_to_dataset
14
org.apache.pig.piggybank.storage.avro.AvroStorage(
15
'input_schema_class', '$schema_input_document_to_dataset');
16

  
17
define avro_load_document_to_research_initiative
18
org.apache.pig.piggybank.storage.avro.AvroStorage(
19
'input_schema_class', '$schema_input_document_to_research_initiative');
20

  
21
define avro_load_document_to_document_clusters
22
org.apache.pig.piggybank.storage.avro.AvroStorage(
23
'input_schema_class', '$schema_input_document_to_document_clusters');
24

  
25
define avro_load_document_to_document_classes
26
org.apache.pig.piggybank.storage.avro.AvroStorage(
27
'input_schema_class', '$schema_input_document_to_document_classes');
28

  
29
define avro_load_document_to_document_statistics
30
org.apache.pig.piggybank.storage.avro.AvroStorage(
31
'input_schema_class', '$schema_input_document_to_document_statistics');
32

  
33
define avro_load_document_with_website_usage_similarities
34
org.apache.pig.piggybank.storage.avro.AvroStorage(
35
'input_schema_class', '$schema_input_document_with_website_usage_similarities');
36

  
37

  
38
define avro_store_document_with_inferenced_data
39
org.apache.pig.piggybank.storage.avro.AvroStorage(
40
'index', '0',
41
'output_schema_class', '$schema_output_document_with_inferenced_data');
42

  
43

  
44
define FIRST_NOT_NULL_STR eu.dnetlib.iis.transformers.udfs.StringFirstNotEmpty;
45
define NULL_EMPTY eu.dnetlib.iis.transformers.udfs.EmptyBagToNull;
46
define NULL_EMPTY_TUPLE_FIELDS eu.dnetlib.iis.transformers.udfs.NullTupleFieldsToNull;
47
define CREATE_ARRAY eu.dnetlib.iis.transformers.udfs.NullToEmptyBag;
48

  
49
extractedDocument = load '$input_extracted_document_metadata' using avro_load_extracted_document_metadata;
50
citation = load '$input_citation' using avro_load_citation;
51
documentToProject = load '$input_document_to_project' using avro_load_document_to_project;
52
documentToDataset = load '$input_document_to_dataset' using avro_load_document_to_dataset;
53
documentToResearchInitiative = load '$input_document_to_research_initiative' using avro_load_document_to_research_initiative;
54
documentToDocumentClusters = load '$input_document_to_document_clusters' using avro_load_document_to_document_clusters;
55
documentToDocumentClasses = load '$input_document_to_document_classes' using avro_load_document_to_document_classes;
56
documentToDocumentStatistics = load '$input_document_to_document_statistics' using avro_load_document_to_document_statistics;
57
documentWithWebsiteUsageSimilarities = load '$input_document_with_website_usage_similarities' using avro_load_document_with_website_usage_similarities;
58

  
59
documentToProjectGroupped = group documentToProject by documentId;
60
documentToProjectWithArrays = foreach documentToProjectGroupped {
61
    projectIds = foreach documentToProject generate projectId;
62
    generate group as id, projectIds;
63
}
64

  
65
documentToDatasetGroupped = group documentToDataset by documentId;
66
documentToDatasetWithArrays = foreach documentToDatasetGroupped {
67
    datasetIds = foreach documentToDataset generate datasetId;
68
    generate group as id, datasetIds;
69
}
70

  
71
citationGroupped = group citation by sourceDocumentId;
72
citationGrouppedWithText = foreach citationGroupped {
73
    idWithText = foreach citation generate destinationDocumentId as id, rawText as text;
74
    generate group as id, idWithText as citations;
75
}
76

  
77
-- Step 1 of the join chain: full outer join of per-document project ids with per-document
-- citations, so a document present on only one side is kept.
-- FIRST_NOT_NULL_STR and NULL_EMPTY are UDF aliases defined outside the visible part of
-- this script; presumably they pick the first non-null string and turn null into an
-- empty bag, respectively (verify against the UDF sources).
joined1 = join documentToProjectWithArrays by id full, citationGrouppedWithText by id;
78
joined1Cleaned = foreach joined1 generate
79
    FIRST_NOT_NULL_STR(documentToProjectWithArrays::id, citationGrouppedWithText::id) as id,
80
    -- authorIds has no source relation in this script; it is always NULL_EMPTY(null).
    NULL_EMPTY(null) as authorIds,
81
    -- Not wrapped in NULL_EMPTY: stays null for documents without citation rows.
    citationGrouppedWithText::citations as matchedCitationDocumentIds,
82
    NULL_EMPTY(documentToProjectWithArrays::projectIds) as projectIds;
83

  
84
-- Step 2: full outer join of the accumulated record with per-document dataset ids.
-- Fields carried from joined1Cleaned are passed through unchanged, so they are null for
-- documents that appear only on the dataset side.
joined2 = join joined1Cleaned by id full, documentToDatasetWithArrays by id;
85
joined2Cleaned = foreach joined2 generate
86
    FIRST_NOT_NULL_STR(joined1Cleaned::id, documentToDatasetWithArrays::id) as id,
87
    joined1Cleaned::authorIds as authorIds,
88
    joined1Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
89
    joined1Cleaned::projectIds as projectIds,
90
    NULL_EMPTY(documentToDatasetWithArrays::datasetIds) as datasetIds;
91

  
92
-- Step 3: full outer join with documentToDocumentClusters (keyed by documentId), adding
-- the clusters field as loaded, without NULL_EMPTY wrapping.
joined3 = join joined2Cleaned by id full, documentToDocumentClusters by documentId;
93
joined3Cleaned = foreach joined3 generate
94
    FIRST_NOT_NULL_STR(joined2Cleaned::id, documentToDocumentClusters::documentId) as id,
95
    joined2Cleaned::authorIds as authorIds,
96
    joined2Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
97
    joined2Cleaned::projectIds as projectIds,
98
    joined2Cleaned::datasetIds as datasetIds,
99
    documentToDocumentClusters::clusters as clusters;
100

  
101
-- Step 4: full outer join with documentToDocumentClasses (keyed by documentId), adding
-- the classes field as loaded.
joined4 = join joined3Cleaned by id full, documentToDocumentClasses by documentId;
102
joined4Cleaned = foreach joined4 generate
103
    FIRST_NOT_NULL_STR(joined3Cleaned::id, documentToDocumentClasses::documentId) as id,
104
    joined3Cleaned::authorIds as authorIds,
105
    joined3Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
106
    joined3Cleaned::projectIds as projectIds,
107
    joined3Cleaned::datasetIds as datasetIds,
108
    joined3Cleaned::clusters as clusters,
109
    documentToDocumentClasses::classes as classes;
110

  
111
-- Step 5: full outer join with documentToDocumentStatistics (keyed by documentId), adding
-- the statistics field as loaded.
joined5 = join joined4Cleaned by id full, documentToDocumentStatistics by documentId;
112
joined5Cleaned = foreach joined5 generate
113
    FIRST_NOT_NULL_STR(joined4Cleaned::id, documentToDocumentStatistics::documentId) as id,
114
    joined4Cleaned::authorIds as authorIds,
115
    joined4Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
116
    joined4Cleaned::projectIds as projectIds,
117
    joined4Cleaned::datasetIds as datasetIds,
118
    joined4Cleaned::clusters as clusters,
119
    joined4Cleaned::classes as classes,
120
    documentToDocumentStatistics::statistics as statistics;
121

  
122
-- Collapse website-usage similarities to one row per document: (id, websiteUsageSimilarities).
-- Each bag entry is (otherDocumentId renamed to documentId, covisitedSimilarity).
documentWithWebsiteUsageSimilaritiesGroupped = group documentWithWebsiteUsageSimilarities by documentId;
123
outputSimilarities = foreach documentWithWebsiteUsageSimilaritiesGroupped {
124
    websiteUsageSimilarities = foreach documentWithWebsiteUsageSimilarities generate otherDocumentId as documentId, covisitedSimilarity as covisitedSimilarity;
125
    generate group as id, websiteUsageSimilarities;
126
}
127

  
128
-- Step 6: full outer join with the grouped website-usage similarities, adding the
-- websiteUsageSimilarities bag (null when the document has no similarity rows).
joined6 = join joined5Cleaned by id full, outputSimilarities by id;
129
joined6Cleaned = foreach joined6 generate
130
    FIRST_NOT_NULL_STR(joined5Cleaned::id, outputSimilarities::id) as id,
131
    joined5Cleaned::authorIds as authorIds,
132
    joined5Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
133
    joined5Cleaned::projectIds as projectIds,
134
    joined5Cleaned::datasetIds as datasetIds,
135
    joined5Cleaned::clusters as clusters,
136
    joined5Cleaned::classes as classes,
137
    joined5Cleaned::statistics as statistics,
138
    outputSimilarities::websiteUsageSimilarities as websiteUsageSimilarities;
139

  
140
-- Collapse documentToResearchInitiative to one row per document:
-- (id, researchInitiativeConceptIds), the bag of egiConceptId values for that documentId.
researchInitiativeGroupped = group documentToResearchInitiative by documentId;
141
researchInitiative = foreach researchInitiativeGroupped {
142
    ids = foreach documentToResearchInitiative generate egiConceptId;
143
    generate group as id, ids as researchInitiativeConceptIds;
144
}
145

  
146
-- Step 7: full outer join with the grouped research-initiative concept ids. This is the
-- last inferred-data relation merged before joining with the extracted metadata.
joined7 = join joined6Cleaned by id full, researchInitiative by id;
147
joined7Cleaned = foreach joined7 generate
148
    FIRST_NOT_NULL_STR(joined6Cleaned::id, researchInitiative::id) as id,
149
    joined6Cleaned::authorIds as authorIds,
150
    joined6Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
151
    joined6Cleaned::projectIds as projectIds,
152
    joined6Cleaned::datasetIds as datasetIds,
153
    researchInitiative::researchInitiativeConceptIds as researchInitiativeConceptIds,
154
    joined6Cleaned::clusters as clusters,
155
    joined6Cleaned::classes as classes,
156
    joined6Cleaned::statistics as statistics,
157
    joined6Cleaned::websiteUsageSimilarities as websiteUsageSimilarities;
158

  
159
-- Final step: full outer join of the extracted document metadata with all inferred data,
-- producing the output record. Documents present on only one side are kept; the fields
-- coming from the missing side are null.
joinedFull = join extractedDocument by id full, joined7Cleaned by id;
160
joinedFullCleaned = foreach joinedFull generate 
161
    FIRST_NOT_NULL_STR(extractedDocument::id, joined7Cleaned::id) as id, 
162
    extractedDocument::title as title, extractedDocument::abstract as abstract, 
163
    extractedDocument::language as language, extractedDocument::keywords as keywords, 
164
    extractedDocument::externalIdentifiers as externalIdentifiers, 
165
    extractedDocument::journal as journal, extractedDocument::year as year, 
166
    -- text is always emitted as a null chararray; no input supplies it here.
    extractedDocument::publisher as publisher, (chararray)null as text,
167
    joined7Cleaned::projectIds as projectIds,
168
    joined7Cleaned::authorIds as authorIds,
169
    joined7Cleaned::matchedCitationDocumentIds as matchedCitationDocumentIds,
170
    -- datasetIds is renamed to referencedDataSetIds in the output record.
    joined7Cleaned::datasetIds as referencedDataSetIds,
171
    joined7Cleaned::researchInitiativeConceptIds as researchInitiativeConceptIds,
172
    joined7Cleaned::clusters as clusters,
173
    joined7Cleaned::classes as classes,
174
    joined7Cleaned::statistics as statistics,
175
    joined7Cleaned::websiteUsageSimilarities as websiteUsageSimilarities;
176

  
177
-- Write the merged records to the path given by $output_document_with_inferenced_data,
-- using the avro_store_document_with_inferenced_data storer alias (presumably defined
-- earlier in this script, outside the visible part; verify).
store joinedFullCleaned into '$output_document_with_inferenced_data' using avro_store_document_with_inferenced_data;
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/document/oozie_app/workflow.xml
1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="transformers_export_document">
2
	
3
	<!-- Workflow parameters: nine input paths and one output path. None declares a
	     <value> default. NOTE(review): per the Oozie 0.4 schema such parameters are
	     presumably mandatory at submission time; confirm. -->
	<parameters>
4
		<property>
5
			<name>input_extracted_document_metadata</name>
6
		</property>
7
		<property>
8
			<name>input_citation</name>
9
		</property>
10
		<property>
11
			<name>input_document_to_project</name>
12
		</property>
13
		<property>
14
			<name>input_document_to_dataset</name>
15
		</property>
16
        <property>
17
			<name>input_document_to_research_initiative</name>
18
		</property>
19
		<property>
20
			<name>input_document_to_document_clusters</name>
21
		</property>
22
		<property>
23
			<name>input_document_to_document_classes</name>
24
		</property>
25
		<property>
26
			<name>input_document_to_document_statistics</name>
27
		</property>
28
		<property>
29
			<name>input_document_with_website_usage_similarities</name>
30
		</property>
31
		<property>
32
			<name>output_document_with_inferenced_data</name>
33
		</property>
34
	</parameters>
35
    
36
    <start to="transformer"/>
37
    <!-- Single Pig action of this workflow: runs lib/scripts/transformer.pig.
         The prepare step deletes the node's working directory and the output path,
         then recreates the working directory. Success goes to "end", failure to "fail". -->
    <action name="transformer">
38
        <pig>
39
            <job-tracker>${jobTracker}</job-tracker>
40
            <name-node>${nameNode}</name-node>
41
			<!-- The data generated by this node is deleted in this section -->
42
			<prepare>
43
				<delete path="${nameNode}${workingDir}/transformer" />
44
				<delete path="${nameNode}${output_document_with_inferenced_data}" />
45
				<mkdir path="${nameNode}${workingDir}/transformer" />
46
			</prepare>
47
            <configuration>
48
                <property>
49
                    <name>mapred.job.queue.name</name>
50
                    <value>${queueName}</value>
51
                </property>
52
            </configuration>
53
            <!-- Path to PIG script the workflow executes. -->
54
            <script>lib/scripts/transformer.pig</script>
55
            <!-- The working directory of the workflow node. -->
56
            <param>workingDir=${workingDir}/transformer/working_dir</param>
57
            
            <!-- Each data path below is passed together with a schema_* parameter holding
                 a fully-qualified schema class name. NOTE(review): the schema_* values are
                 presumably consumed by the loader/storer definitions at the top of the
                 Pig script; verify there. -->
58
            <param>input_extracted_document_metadata=${input_extracted_document_metadata}</param>
59
            <param>schema_input_extracted_document_metadata=eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</param>
60
            
61
            <param>input_citation=${input_citation}</param>
62
            <param>schema_input_citation=eu.dnetlib.iis.citationmatching.schemas.Citation</param>
63
            
64
            <param>input_document_to_project=${input_document_to_project}</param>
65
            <param>schema_input_document_to_project=eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</param>
66
            
67
            <param>input_document_to_dataset=${input_document_to_dataset}</param>
68
            <param>schema_input_document_to_dataset=eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</param>
69
            
70
            <param>input_document_to_research_initiative=${input_document_to_research_initiative}</param>
71
            <param>schema_input_document_to_research_initiative=eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToResearchInitiative</param>
72
            
73
            <param>input_document_to_document_clusters=${input_document_to_document_clusters}</param>
74
            <param>schema_input_document_to_document_clusters=eu.dnetlib.iis.documentsclustering.schemas.DocumentToDocumentClusters</param>
75
            
76
            <param>input_document_to_document_classes=${input_document_to_document_classes}</param>
77
            <param>schema_input_document_to_document_classes=eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses</param>
78
        
79
            <param>input_document_to_document_statistics=${input_document_to_document_statistics}</param>
80
            <param>schema_input_document_to_document_statistics=eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics</param>
81
        
82
            <param>input_document_with_website_usage_similarities=${input_document_with_website_usage_similarities}</param>
83
            <param>schema_input_document_with_website_usage_similarities=eu.dnetlib.iis.websiteusage.schemas.DocumentsWithWebsiteUsageSimilarities</param>
84
            
85
            <param>output_document_with_inferenced_data=${output_document_with_inferenced_data}</param>
86
            <param>schema_output_document_with_inferenced_data=eu.dnetlib.iis.export.schemas.DocumentWithInferencedData</param>
87
        </pig>
88
        <ok to="end"/>
89
        <error to="fail"/>
90
    </action>
91
    <!-- Kill node targeted by the action's error transition; its message embeds the
         error message of the last node that failed. -->
    <kill name="fail">
92
		<message>Unfortunately, the workflow failed -- error message:
93
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
94
    </kill>
95
    <end name="end"/>
96
</workflow-app>

Also available in: Unified diff