Revision 33596
Added by Marek Horst almost 10 years ago
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>icm-iis-parent-container</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
</parent> |
|
8 |
<modelVersion>4.0.0</modelVersion> |
|
9 |
<artifactId>icm-iis-statistics</artifactId> |
|
10 |
<packaging>jar</packaging> |
|
11 |
<version>1.0.0</version> |
|
12 |
|
|
13 |
<scm> |
|
14 |
<developerConnection> |
|
15 |
scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0 |
|
16 |
</developerConnection> |
|
17 |
</scm> |
|
18 |
|
|
19 |
<properties> |
|
20 |
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|
21 |
</properties> |
|
22 |
<dependencies> |
|
23 |
<dependency> |
|
24 |
<groupId>junit</groupId> |
|
25 |
<artifactId>junit</artifactId> |
|
26 |
<version>4.10</version> |
|
27 |
<scope>test</scope> |
|
28 |
</dependency> |
|
29 |
|
|
30 |
<dependency> |
|
31 |
<groupId>eu.dnetlib</groupId> |
|
32 |
<artifactId>icm-iis-core</artifactId> |
|
33 |
<version>1.0.0</version> |
|
34 |
</dependency> |
|
35 |
<dependency> |
|
36 |
<groupId>eu.dnetlib</groupId> |
|
37 |
<artifactId>icm-iis-core</artifactId> |
|
38 |
<version>1.0.0</version> |
|
39 |
<type>test-jar</type> |
|
40 |
<scope>test</scope> |
|
41 |
</dependency> |
|
42 |
<dependency> |
|
43 |
<groupId>eu.dnetlib</groupId> |
|
44 |
<artifactId>icm-iis-schemas</artifactId> |
|
45 |
<version>1.0.0</version> |
|
46 |
</dependency> |
|
47 |
<!-- required after introducing 'provided' scope for hadoop libs --> |
|
48 |
<dependency> |
|
49 |
<groupId>org.apache.hadoop</groupId> |
|
50 |
<artifactId>hadoop-common</artifactId> |
|
51 |
<version>${iis.hadoop.common.version}</version> |
|
52 |
<scope>provided</scope> |
|
53 |
</dependency> |
|
54 |
<!-- Needed by Oozie tests { --> |
|
55 |
<!-- required after introducing 'provided' scope for hadoop dependencies --> |
|
56 |
<dependency> |
|
57 |
<groupId>org.apache.oozie</groupId> |
|
58 |
<artifactId>oozie-core</artifactId> |
|
59 |
<version>${iis.oozie.version}</version> |
|
60 |
<scope>test</scope> |
|
61 |
</dependency> |
|
62 |
<dependency> |
|
63 |
<groupId>org.apache.hadoop</groupId> |
|
64 |
<artifactId>hadoop-hdfs</artifactId> |
|
65 |
<version>${iis.hadoop.hdfs.version}</version> |
|
66 |
<scope>test</scope> |
|
67 |
</dependency> |
|
68 |
<!-- end of required after introducing 'provided' scope for hadoop dependencies --> |
|
69 |
<dependency> |
|
70 |
<groupId>org.apache.oozie</groupId> |
|
71 |
<artifactId>oozie-core</artifactId> |
|
72 |
<version>${iis.oozie.version}</version> |
|
73 |
<type>test-jar</type> |
|
74 |
<scope>test</scope> |
|
75 |
</dependency> |
|
76 |
<dependency> |
|
77 |
<groupId>org.apache.hadoop</groupId> |
|
78 |
<artifactId>hadoop-hdfs</artifactId> |
|
79 |
<version>${iis.hadoop.hdfs.version}</version> |
|
80 |
<type>test-jar</type> |
|
81 |
<scope>test</scope> |
|
82 |
</dependency> |
|
83 |
<dependency> |
|
84 |
<groupId>org.apache.hadoop</groupId> |
|
85 |
<artifactId>hadoop-test</artifactId> |
|
86 |
<version>${iis.hadoop.test.version}</version> |
|
87 |
<scope>test</scope> |
|
88 |
</dependency> |
|
89 |
<dependency> |
|
90 |
<groupId>org.apache.hadoop</groupId> |
|
91 |
<artifactId>hadoop-common</artifactId> |
|
92 |
<version>${iis.hadoop.common.version}</version> |
|
93 |
<type>test-jar</type> |
|
94 |
<scope>test</scope> |
|
95 |
</dependency> |
|
96 |
|
|
97 |
<dependency> |
|
98 |
<groupId>org.apache.hive</groupId> |
|
99 |
<artifactId>hive-exec</artifactId> |
|
100 |
<version>${iis.hive.version}</version> |
|
101 |
</dependency> |
|
102 |
<dependency> |
|
103 |
<groupId>org.apache.hive</groupId> |
|
104 |
<artifactId>hive-cli</artifactId> |
|
105 |
<version>${iis.hive.version}</version> |
|
106 |
</dependency> |
|
107 |
<dependency> |
|
108 |
<groupId>org.apache.hive</groupId> |
|
109 |
<artifactId>hive-builtins</artifactId> |
|
110 |
<version>${iis.hive.version}</version> |
|
111 |
</dependency> |
|
112 |
</dependencies> |
|
113 |
<repositories> |
|
114 |
<!-- This repository contains our patched |
|
115 |
version of "avro" and "avro-mapred" modules (see the dependencies section). |
|
116 |
This entry might be removed when the patch to these modules becomes |
|
117 |
a part of the official Avro release.--> |
|
118 |
<repository> |
|
119 |
<id>dnet-deps</id> |
|
120 |
<name>dnet dependencies</name> |
|
121 |
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url> |
|
122 |
<releases> |
|
123 |
<enabled>true</enabled> |
|
124 |
</releases> |
|
125 |
<snapshots> |
|
126 |
<enabled>false</enabled> |
|
127 |
</snapshots> |
|
128 |
<layout>default</layout> |
|
129 |
</repository> |
|
130 |
</repositories> |
|
131 |
</project> |
|
0 | 132 |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/main/resources/eu/dnetlib/iis/statistics/main/oozie_app/lib/scripts/generator.q | ||
---|---|---|
1 |
CREATE EXTERNAL TABLE document |
|
2 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
3 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
4 |
STORED AS |
|
5 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
6 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
7 |
LOCATION '${input_document_authors_citations}' |
|
8 |
TBLPROPERTIES ('avro.schema.literal'='${schema_input_document_authors_citations}'); |
|
9 |
|
|
10 |
CREATE EXTERNAL TABLE projectId |
|
11 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
12 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
13 |
STORED AS |
|
14 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
15 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
16 |
LOCATION '${input_project_id}' |
|
17 |
TBLPROPERTIES ('avro.schema.literal'='${schema_input_project_id}'); |
|
18 |
|
|
19 |
CREATE EXTERNAL TABLE personId |
|
20 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
21 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
22 |
STORED AS |
|
23 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
24 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
25 |
LOCATION '${input_person_id}' |
|
26 |
TBLPROPERTIES ('avro.schema.literal'='${schema_input_person_id}'); |
|
27 |
|
|
28 |
|
|
29 |
CREATE TABLE document_statistics |
|
30 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
31 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
32 |
STORED AS |
|
33 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
34 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
35 |
LOCATION '${output_document_statistics}' |
|
36 |
TBLPROPERTIES ('avro.schema.literal'='${schema_output_document_statistics}'); |
|
37 |
|
|
38 |
CREATE TABLE author_statistics |
|
39 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
40 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
41 |
STORED AS |
|
42 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
43 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
44 |
LOCATION '${output_author_statistics}' |
|
45 |
TBLPROPERTIES ('avro.schema.literal'='${schema_output_author_statistics}'); |
|
46 |
|
|
47 |
CREATE TABLE project_statistics |
|
48 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
49 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
50 |
STORED AS |
|
51 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
52 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
53 |
LOCATION '${output_project_statistics}' |
|
54 |
TBLPROPERTIES ('avro.schema.literal'='${schema_output_project_statistics}'); |
|
55 |
|
|
56 |
CREATE TABLE global_statistics |
|
57 |
COMMENT "A table backed by Avro data with the Avro schema stored in HDFS" |
|
58 |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
|
59 |
STORED AS |
|
60 |
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
|
61 |
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
|
62 |
LOCATION '${output_global_statistics}' |
|
63 |
TBLPROPERTIES ('avro.schema.literal'='${schema_output_global_statistics}'); |
|
64 |
|
|
65 |
-- UDFs |
|
66 |
|
|
67 |
CREATE TEMPORARY FUNCTION collect_all AS 'eu.dnetlib.iis.core.hive.CollectAllUDAF'; |
|
68 |
CREATE TEMPORARY FUNCTION count_array AS 'eu.dnetlib.iis.core.hive.CountArrayElementsUDF'; |
|
69 |
CREATE TEMPORARY FUNCTION empty_array AS 'eu.dnetlib.iis.core.hive.CreateEmptyArrayUDF'; |
|
70 |
CREATE TEMPORARY FUNCTION merge_maps AS 'eu.dnetlib.iis.core.hive.MergeStringIntMapsUDAF'; |
|
71 |
CREATE TEMPORARY FUNCTION list_to_map AS 'eu.dnetlib.iis.core.hive.CountArrayElementsUDAF'; |
|
72 |
CREATE TEMPORARY FUNCTION gen_coauthors AS 'eu.dnetlib.iis.statistics.hive.GenerateCoauthorsUDF'; |
|
73 |
|
|
74 |
-- common tables |
|
75 |
|
|
76 |
create table citations |
|
77 |
location '${workingDir}/citations' |
|
78 |
as |
|
79 |
select documentId, year, refDocId, isPublished |
|
80 |
from document lateral view explode(referencedDocumentsIds) ids AS refDocId; |
|
81 |
|
|
82 |
create table citationsWithNulls |
|
83 |
location '${workingDir}/citationsWithNulls' |
|
84 |
as |
|
85 |
select citations.documentId as sourceId, citations.isPublished as sourcePublished, citations.year, document.documentId as targetId, document.isPublished as targetPublished from |
|
86 |
document left outer join citations |
|
87 |
on document.documentId = citations.refDocId; |
|
88 |
|
|
89 |
create table publishedSourceCitations |
|
90 |
location '${workingDir}/publishedSourceCitations' |
|
91 |
as |
|
92 |
select * from citationsWithNulls where sourcePublished or sourceId is null; |
|
93 |
|
|
94 |
create table docStats |
|
95 |
location '${workingDir}/docStats' |
|
96 |
as |
|
97 |
select |
|
98 |
targetId as docId, |
|
99 |
collect_all(targetPublished)[0] as published, |
|
100 |
cast(count(sourceId) as INT) as numberOfCitations, |
|
101 |
count_array(collect_all(year)) as numberOfCitationsPerYear, |
|
102 |
map("1", if(count(sourceId) >= 1, 1, 0), |
|
103 |
"10", if(count(sourceId) >= 10, 1, 0), |
|
104 |
"50", if(count(sourceId) >= 50, 1, 0), |
|
105 |
"100", if(count(sourceId) >= 100, 1, 0), |
|
106 |
"250", if(count(sourceId) >= 250, 1, 0), |
|
107 |
"500", if(count(sourceId) >= 500, 1, 0)) as numberOfPapersCitedAtLeastXTimes |
|
108 |
from citationsWithNulls group by targetId; |
|
109 |
|
|
110 |
create table publishedSourceDocStats |
|
111 |
location '${workingDir}/publishedSourceDocStats' |
|
112 |
as |
|
113 |
select |
|
114 |
targetId as docId, |
|
115 |
collect_all(targetPublished)[0] as published, |
|
116 |
cast(count(sourceId) as INT) as numberOfCitations, |
|
117 |
count_array(collect_all(year)) as numberOfCitationsPerYear, |
|
118 |
map("1", if(count(sourceId) >= 1, 1, 0), |
|
119 |
"10", if(count(sourceId) >= 10, 1, 0), |
|
120 |
"50", if(count(sourceId) >= 50, 1, 0), |
|
121 |
"100", if(count(sourceId) >= 100, 1, 0), |
|
122 |
"250", if(count(sourceId) >= 250, 1, 0), |
|
123 |
"500", if(count(sourceId) >= 500, 1, 0)) as numberOfPapersCitedAtLeastXTimes |
|
124 |
from publishedSourceCitations group by targetId; |
|
125 |
|
|
126 |
create table allDocStatistics |
|
127 |
location '${workingDir}/allDocStatistics' |
|
128 |
as |
|
129 |
select |
|
130 |
docStats.docId, |
|
131 |
docStats.published, |
|
132 |
docStats.numberOfCitations, |
|
133 |
docStats.numberOfCitationsPerYear, |
|
134 |
docStats.numberOfPapersCitedAtLeastXTimes, |
|
135 |
coalesce(publishedSourceDocStats.numberOfCitations, 0) as numberOfPublishedCitations, |
|
136 |
coalesce(publishedSourceDocStats.numberOfCitationsPerYear, map('unknown', 0)) as numberOfPublishedCitationsPerYear, |
|
137 |
coalesce(publishedSourceDocStats.numberOfPapersCitedAtLeastXTimes, map("1", 0, "10", 0, "50", 0, "100", 0, "250", 0, "500", 0)) as numberOfPapersCitedAtLeastXTimesByPublished |
|
138 |
from docStats left outer join publishedSourceDocStats |
|
139 |
on docStats.docId = publishedSourceDocStats.docId; |
|
140 |
|
|
141 |
create table publishedDocStatistics |
|
142 |
location '${workingDir}/publishedDocStatistics' |
|
143 |
as |
|
144 |
select * from allDocStatistics |
|
145 |
where published; |
|
146 |
|
|
147 |
|
|
148 |
-- document statistics |
|
149 |
|
|
150 |
insert overwrite table document_statistics |
|
151 |
select |
|
152 |
docId as documentId, |
|
153 |
named_struct( |
|
154 |
"citationsFromAllPapers", named_struct( |
|
155 |
"numberOfCitations", numberOfCitations, |
|
156 |
"numberOfCitationsPerYear", numberOfCitationsPerYear), |
|
157 |
"citationsFromPublishedPapers", named_struct( |
|
158 |
"numberOfCitations", numberOfPublishedCitations, |
|
159 |
"numberOfCitationsPerYear", numberOfPublishedCitationsPerYear) |
|
160 |
) as statistic |
|
161 |
from allDocStatistics; |
|
162 |
|
|
163 |
|
|
164 |
-- global statistics |
|
165 |
|
|
166 |
create table globalAll |
|
167 |
location '${workingDir}/globalAll' |
|
168 |
as |
|
169 |
select |
|
170 |
named_struct( |
|
171 |
'numberOfPapers', cast(count(docId) as INT), |
|
172 |
'citationsFromAllPapers', |
|
173 |
named_struct( |
|
174 |
'basic', |
|
175 |
named_struct( |
|
176 |
'numberOfCitations', cast(sum(numberOfCitations) as INT), |
|
177 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
178 |
), |
|
179 |
'averageNumberOfCitationsPerPaper', cast(avg(numberOfCitations) as FLOAT), |
|
180 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
181 |
), |
|
182 |
'citationsFromPublishedPapers', |
|
183 |
named_struct( |
|
184 |
'basic', |
|
185 |
named_struct( |
|
186 |
'numberOfCitations', cast(sum(numberOfPublishedCitations) as INT), |
|
187 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
188 |
), |
|
189 |
'averageNumberOfCitationsPerPaper', cast(avg(numberOfPublishedCitations) as FLOAT), |
|
190 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
191 |
) |
|
192 |
) as allPapers |
|
193 |
from allDocStatistics; |
|
194 |
|
|
195 |
create table globalPublished |
|
196 |
location '${workingDir}/globalPublished' |
|
197 |
as |
|
198 |
select |
|
199 |
named_struct( |
|
200 |
'numberOfPapers', cast(count(docId) as INT), |
|
201 |
'citationsFromAllPapers', |
|
202 |
named_struct( |
|
203 |
'basic', |
|
204 |
named_struct( |
|
205 |
'numberOfCitations', cast(sum(numberOfCitations) as INT), |
|
206 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
207 |
), |
|
208 |
'averageNumberOfCitationsPerPaper', cast(avg(numberOfCitations) as FLOAT), |
|
209 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
210 |
), |
|
211 |
'citationsFromPublishedPapers', |
|
212 |
named_struct( |
|
213 |
'basic', |
|
214 |
named_struct( |
|
215 |
'numberOfCitations', cast(sum(numberOfPublishedCitations) as INT), |
|
216 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
217 |
), |
|
218 |
'averageNumberOfCitationsPerPaper', cast(avg(numberOfPublishedCitations) as FLOAT), |
|
219 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
220 |
) |
|
221 |
) as publishedPapers |
|
222 |
from publishedDocStatistics; |
|
223 |
|
|
224 |
insert overwrite table global_statistics |
|
225 |
select globalAll.allPapers, globalPublished.publishedPapers |
|
226 |
from globalAll join globalPublished; |
|
227 |
|
|
228 |
|
|
229 |
-- project statistics |
|
230 |
|
|
231 |
create table projectDocument |
|
232 |
location '${workingDir}/projectDocument' |
|
233 |
as |
|
234 |
select projectId.id as projId, projDoc.documentId from |
|
235 |
projectId left outer join |
|
236 |
(select documentId, projectId |
|
237 |
from document lateral view explode(projectIds) ids as projectId) projDoc |
|
238 |
on projectId.id = projDoc.projectId; |
|
239 |
|
|
240 |
create table projectAll |
|
241 |
location '${workingDir}/projectAll' |
|
242 |
as |
|
243 |
select |
|
244 |
projId, |
|
245 |
named_struct( |
|
246 |
'numberOfPapers', cast(count(docId) as INT), |
|
247 |
'citationsFromAllPapers', |
|
248 |
named_struct( |
|
249 |
'basic', |
|
250 |
named_struct( |
|
251 |
'numberOfCitations', coalesce(cast(sum(numberOfCitations) as INT), 0), |
|
252 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
253 |
), |
|
254 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfCitations) as FLOAT), 0), |
|
255 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
256 |
), |
|
257 |
'citationsFromPublishedPapers', |
|
258 |
named_struct( |
|
259 |
'basic', |
|
260 |
named_struct( |
|
261 |
'numberOfCitations', coalesce(cast(sum(numberOfPublishedCitations) as INT), 0), |
|
262 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
263 |
), |
|
264 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfPublishedCitations) as FLOAT), 0), |
|
265 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
266 |
) |
|
267 |
) as allPapers |
|
268 |
from |
|
269 |
(select * from |
|
270 |
projectDocument left outer join allDocStatistics |
|
271 |
on projectDocument.documentId = allDocStatistics.docId) merged |
|
272 |
group by projId; |
|
273 |
|
|
274 |
create table projectPublished |
|
275 |
location '${workingDir}/projectPublished' |
|
276 |
as |
|
277 |
select |
|
278 |
projId, |
|
279 |
named_struct( |
|
280 |
'numberOfPapers', cast(count(docId) as INT), |
|
281 |
'citationsFromAllPapers', |
|
282 |
named_struct( |
|
283 |
'basic', |
|
284 |
named_struct( |
|
285 |
'numberOfCitations', coalesce(cast(sum(numberOfCitations) as INT), 0), |
|
286 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
287 |
), |
|
288 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfCitations) as FLOAT), 0), |
|
289 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
290 |
), |
|
291 |
'citationsFromPublishedPapers', |
|
292 |
named_struct( |
|
293 |
'basic', |
|
294 |
named_struct( |
|
295 |
'numberOfCitations', coalesce(cast(sum(numberOfPublishedCitations) as INT), 0), |
|
296 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
297 |
), |
|
298 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfPublishedCitations) as FLOAT), 0), |
|
299 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
300 |
) |
|
301 |
) as publishedPapers |
|
302 |
from |
|
303 |
(select * from |
|
304 |
projectDocument left outer join publishedDocStatistics |
|
305 |
on projectDocument.documentId = publishedDocStatistics.docId) merged |
|
306 |
group by projId; |
|
307 |
|
|
308 |
insert overwrite table project_statistics |
|
309 |
select |
|
310 |
projectAll.projId as projectId, |
|
311 |
named_struct( |
|
312 |
'allPapers', projectAll.allPapers, |
|
313 |
'publishedPapers', projectPublished.publishedPapers |
|
314 |
) as statistic |
|
315 |
from projectAll join projectPublished |
|
316 |
on projectAll.projId = projectPublished.projId; |
|
317 |
|
|
318 |
|
|
319 |
-- author stats |
|
320 |
|
|
321 |
create table authorDocument |
|
322 |
location '${workingDir}/authorDocument' |
|
323 |
as |
|
324 |
select personId.id as authorId, authDoc.documentId from |
|
325 |
personId left outer join |
|
326 |
(select documentId, authorId |
|
327 |
from document lateral view explode(authorIds) ids as authorId) authDoc |
|
328 |
on personId.id = authDoc.authorId; |
|
329 |
|
|
330 |
create table authorAll |
|
331 |
location '${workingDir}/authorAll' |
|
332 |
as |
|
333 |
select |
|
334 |
authorId, |
|
335 |
named_struct( |
|
336 |
'numberOfPapers', cast(count(docId) as INT), |
|
337 |
'citationsFromAllPapers', |
|
338 |
named_struct( |
|
339 |
'basic', |
|
340 |
named_struct( |
|
341 |
'numberOfCitations', coalesce(cast(sum(numberOfCitations) as INT), 0), |
|
342 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
343 |
), |
|
344 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfCitations) as FLOAT), 0), |
|
345 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
346 |
), |
|
347 |
'citationsFromPublishedPapers', |
|
348 |
named_struct( |
|
349 |
'basic', |
|
350 |
named_struct( |
|
351 |
'numberOfCitations', coalesce(cast(sum(numberOfPublishedCitations) as INT), 0), |
|
352 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
353 |
), |
|
354 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfPublishedCitations) as FLOAT), 0), |
|
355 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
356 |
) |
|
357 |
) as allPapers |
|
358 |
from |
|
359 |
(select * from |
|
360 |
authorDocument left outer join allDocStatistics |
|
361 |
on authorDocument.documentId = allDocStatistics.docId) merged |
|
362 |
group by authorId; |
|
363 |
|
|
364 |
create table authorPublished |
|
365 |
location '${workingDir}/authorPublished' |
|
366 |
as |
|
367 |
select |
|
368 |
authorId, |
|
369 |
named_struct( |
|
370 |
'numberOfPapers', cast(count(docId) as INT), |
|
371 |
'citationsFromAllPapers', |
|
372 |
named_struct( |
|
373 |
'basic', |
|
374 |
named_struct( |
|
375 |
'numberOfCitations', coalesce(cast(sum(numberOfCitations) as INT), 0), |
|
376 |
'numberOfCitationsPerYear', merge_maps(numberOfCitationsPerYear) |
|
377 |
), |
|
378 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfCitations) as FLOAT), 0), |
|
379 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimes) |
|
380 |
), |
|
381 |
'citationsFromPublishedPapers', |
|
382 |
named_struct( |
|
383 |
'basic', |
|
384 |
named_struct( |
|
385 |
'numberOfCitations', coalesce(cast(sum(numberOfPublishedCitations) as INT), 0), |
|
386 |
'numberOfCitationsPerYear', merge_maps(numberOfPublishedCitationsPerYear) |
|
387 |
), |
|
388 |
'averageNumberOfCitationsPerPaper', coalesce(cast(avg(numberOfPublishedCitations) as FLOAT), 0), |
|
389 |
'numberOfPapersCitedAtLeastXTimes', merge_maps(numberOfPapersCitedAtLeastXTimesByPublished) |
|
390 |
) |
|
391 |
) as publishedPapers |
|
392 |
from |
|
393 |
(select * from |
|
394 |
authorDocument left outer join publishedDocStatistics |
|
395 |
on authorDocument.documentId = publishedDocStatistics.docId) merged |
|
396 |
group by authorId; |
|
397 |
|
|
398 |
create table docCoauthors |
|
399 |
location '${workingDir}/docCoauthors' |
|
400 |
as |
|
401 |
select documentId, authorId, authorIds |
|
402 |
from document lateral view explode(authorIds) ids as authorId; |
|
403 |
|
|
404 |
create table coAuthorsMap |
|
405 |
location '${workingDir}/coauthorstmp' |
|
406 |
as |
|
407 |
select authorId, list_to_map(authorIds) as coAuthorsMap |
|
408 |
from docCoauthors |
|
409 |
group by authorId; |
|
410 |
|
|
411 |
create table coauthors |
|
412 |
location '${workingDir}/coauthors' |
|
413 |
as |
|
414 |
select authorId, gen_coauthors(authorId, coAuthorsMap) as coAuthors |
|
415 |
from coAuthorsMap; |
|
416 |
|
|
417 |
|
|
418 |
create table coauthorsFull |
|
419 |
location '${workingDir}/coauthorsFull' |
|
420 |
as |
|
421 |
select personId.id as authorId, coalesce(coauthors.coAuthors, empty_array(named_struct("id", "id", "coauthoredPapersCount", 0))) as coAuthor from |
|
422 |
personId left outer join coauthors |
|
423 |
on personId.id = coauthors.authorId; |
|
424 |
|
|
425 |
insert overwrite table author_statistics |
|
426 |
select |
|
427 |
authorAll.authorId, named_struct( |
|
428 |
"core", named_struct( |
|
429 |
'allPapers', authorAll.allPapers, |
|
430 |
'publishedPapers', authorPublished.publishedPapers), |
|
431 |
"coAuthors", coauthorsFull.coAuthor |
|
432 |
) as statistic |
|
433 |
from authorAll join authorPublished |
|
434 |
on authorAll.authorId = authorPublished.authorId |
|
435 |
join coauthorsFull on authorPublished.authorId = coauthorsFull.authorId; |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/main/resources/eu/dnetlib/iis/statistics/main/oozie_app/workflow.xml | ||
---|---|---|
1 |
<workflow-app xmlns="uri:oozie:workflow:0.4" name="statistics_main"> |
|
2 |
|
|
3 |
<parameters> |
|
4 |
<property> |
|
5 |
<name>input_document_authors_citations</name> |
|
6 |
<description>input documents with authors and citations</description> |
|
7 |
</property> |
|
8 |
<property> |
|
9 |
<name>input_person_id</name> |
|
10 |
<description>input person id</description> |
|
11 |
</property> |
|
12 |
<property> |
|
13 |
<name>input_project_id</name> |
|
14 |
<description>input project id</description> |
|
15 |
</property> |
|
16 |
<property> |
|
17 |
<name>output_document_statistics</name> |
|
18 |
<description>output document statistics</description> |
|
19 |
</property> |
|
20 |
<property> |
|
21 |
<name>output_author_statistics</name> |
|
22 |
<description>output author statistics</description> |
|
23 |
</property> |
|
24 |
<property> |
|
25 |
<name>output_project_statistics</name> |
|
26 |
<description>output project statistics</description> |
|
27 |
</property> |
|
28 |
<property> |
|
29 |
<name>output_global_statistics</name> |
|
30 |
<description>output global statistics</description> |
|
31 |
</property> |
|
32 |
</parameters> |
|
33 |
|
|
34 |
<global> |
|
35 |
<job-tracker>${jobTracker}</job-tracker> |
|
36 |
<name-node>${nameNode}</name-node> |
|
37 |
<configuration> |
|
38 |
<property> |
|
39 |
<name>mapred.job.queue.name</name> |
|
40 |
<value>${queueName}</value> |
|
41 |
</property> |
|
42 |
</configuration> |
|
43 |
</global> |
|
44 |
|
|
45 |
<start to="generate-schema"/> |
|
46 |
|
|
47 |
<action name="generate-schema"> |
|
48 |
<java> |
|
49 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
50 |
<arg>eu.dnetlib.iis.statistics.schemas.DocumentWithAuthorsAndCitations</arg> |
|
51 |
<arg>eu.dnetlib.iis.statistics.schemas.PersonId</arg> |
|
52 |
<arg>eu.dnetlib.iis.statistics.schemas.ProjectId</arg> |
|
53 |
<arg>eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics</arg> |
|
54 |
<arg>eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics</arg> |
|
55 |
<arg>eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics</arg> |
|
56 |
<arg>eu.dnetlib.iis.statistics.schemas.CommonCoreStatistics</arg> |
|
57 |
<capture-output /> |
|
58 |
</java> |
|
59 |
<ok to="generator" /> |
|
60 |
<error to="fail" /> |
|
61 |
</action> |
|
62 |
|
|
63 |
<action name="generator"> |
|
64 |
<hive xmlns="uri:oozie:hive-action:0.2"> |
|
65 |
<job-tracker>${jobTracker}</job-tracker> |
|
66 |
<name-node>${nameNode}</name-node> |
|
67 |
<prepare> |
|
68 |
<delete path="${nameNode}${workingDir}/generator" /> |
|
69 |
<mkdir path="${nameNode}${workingDir}/generator" /> |
|
70 |
<mkdir path="${nameNode}${workingDir}/generator/working_dir" /> |
|
71 |
</prepare> |
|
72 |
<configuration> |
|
73 |
<property> |
|
74 |
<name>mapred.job.queue.name</name> |
|
75 |
<value>${queueName}</value> |
|
76 |
</property> |
|
77 |
<property> |
|
78 |
<name>oozie.hive.defaults</name> |
|
79 |
<value>hive-site.xml</value> |
|
80 |
</property> |
|
81 |
<property> |
|
82 |
<name>hive.exec.scratchdir</name> |
|
83 |
<value>/tmp/hive-${wf:user()}</value> |
|
84 |
</property> |
|
85 |
<property> |
|
86 |
<name>mapred.reduce.tasks</name> |
|
87 |
<value>50</value> |
|
88 |
</property> |
|
89 |
<property> |
|
90 |
<name>mapred.child.java.opts</name> |
|
91 |
<value>-Xmx2048M</value> |
|
92 |
</property> |
|
93 |
</configuration> |
|
94 |
<script>lib/scripts/generator.q</script> |
|
95 |
<!-- The working directory of the workflow node. --> |
|
96 |
<param>workingDir=${workingDir}/generator/working_dir</param> |
|
97 |
|
|
98 |
<param>input_document_authors_citations=${input_document_authors_citations}</param> |
|
99 |
<param>schema_input_document_authors_citations=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.DocumentWithAuthorsAndCitations']}</param> |
|
100 |
|
|
101 |
<param>input_person_id=${input_person_id}</param> |
|
102 |
<param>schema_input_person_id=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.PersonId']}</param> |
|
103 |
|
|
104 |
<param>input_project_id=${input_project_id}</param> |
|
105 |
<param>schema_input_project_id=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.ProjectId']}</param> |
|
106 |
|
|
107 |
<param>output_document_statistics=${output_document_statistics}</param> |
|
108 |
<param>schema_output_document_statistics=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics']}</param> |
|
109 |
|
|
110 |
<param>output_author_statistics=${output_author_statistics}</param> |
|
111 |
<param>schema_output_author_statistics=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics']}</param> |
|
112 |
|
|
113 |
<param>output_project_statistics=${output_project_statistics}</param> |
|
114 |
<param>schema_output_project_statistics=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics']}</param> |
|
115 |
|
|
116 |
<param>output_global_statistics=${output_global_statistics}</param> |
|
117 |
<param>schema_output_global_statistics=${wf:actionData('generate-schema')['eu.dnetlib.iis.statistics.schemas.CommonCoreStatistics']}</param> |
|
118 |
</hive> |
|
119 |
<ok to="end"/> |
|
120 |
<error to="fail"/> |
|
121 |
</action> |
|
122 |
<kill name="fail"> |
|
123 |
<message>Unfortunately, the workflow failed -- error message: |
|
124 |
[${wf:errorMessage(wf:lastErrorNode())}]</message> |
|
125 |
</kill> |
|
126 |
<end name="end"/> |
|
127 |
</workflow-app> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/main/resources/eu/dnetlib/iis/statistics/main/job.properties | ||
---|---|---|
1 |
input_document_authors_citations=/share/transformers/statistics/document_authors_citations/2014-01-21 |
|
2 |
input_person_id=/share/transformers/statistics/person_id/2014-01-21 |
|
3 |
input_project_id=/share/transformers/statistics/project_id/2014-01-21 |
|
4 |
output_document_statistics=${workingDir}/document_statistics |
|
5 |
output_author_statistics=${workingDir}/author_statistics |
|
6 |
output_project_statistics=${workingDir}/project_statistics |
|
7 |
output_global_statistics=${workingDir}/global_statistics |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/main/java/eu/dnetlib/iis/statistics/hive/GenerateCoauthorsUDF.java | ||
---|---|---|
1 |
package eu.dnetlib.iis.statistics.hive; |
|
2 |
|
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.Arrays; |
|
5 |
import java.util.List; |
|
6 |
import java.util.Map; |
|
7 |
import org.apache.hadoop.hive.ql.exec.UDFArgumentException; |
|
8 |
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; |
|
9 |
import org.apache.hadoop.hive.ql.metadata.HiveException; |
|
10 |
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; |
|
11 |
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; |
|
12 |
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; |
|
13 |
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; |
|
14 |
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; |
|
15 |
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; |
|
16 |
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; |
|
17 |
import org.apache.hadoop.io.IntWritable; |
|
18 |
import org.apache.hadoop.io.Text; |
|
19 |
|
|
20 |
/** |
|
21 |
* |
|
22 |
* @author Dominika Tkaczyk |
|
23 |
*/ |
|
24 |
public class GenerateCoauthorsUDF extends GenericUDF { |
|
25 |
|
|
26 |
private StringObjectInspector authorIdOI; |
|
27 |
private MapObjectInspector mapOI; |
|
28 |
private StringObjectInspector mapKeyOI; |
|
29 |
private IntObjectInspector mapValueOI; |
|
30 |
|
|
31 |
@Override |
|
32 |
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { |
|
33 |
if (arguments.length != 2) { |
|
34 |
throw new UDFArgumentLengthException("GenerateCoauthorsUDF takes 2 argument: string, map<string, int>"); |
|
35 |
} |
|
36 |
|
|
37 |
if (!(arguments[0] instanceof StringObjectInspector)) { |
|
38 |
throw new UDFArgumentException("The argument must be a list"); |
|
39 |
} |
|
40 |
|
|
41 |
authorIdOI = (StringObjectInspector) arguments[0]; |
|
42 |
|
|
43 |
if (!(arguments[1] instanceof MapObjectInspector)) { |
|
44 |
throw new UDFArgumentException("The argument must be a list"); |
|
45 |
} |
|
46 |
|
|
47 |
mapOI = (MapObjectInspector) arguments[1]; |
|
48 |
|
|
49 |
if (!(mapOI.getMapKeyObjectInspector() instanceof StringObjectInspector)) { |
|
50 |
throw new UDFArgumentException("The argument must be a list"); |
|
51 |
} |
|
52 |
|
|
53 |
mapKeyOI = (StringObjectInspector) mapOI.getMapKeyObjectInspector(); |
|
54 |
|
|
55 |
if (!(mapOI.getMapValueObjectInspector() instanceof IntObjectInspector)) { |
|
56 |
throw new UDFArgumentException("The argument must be a list"); |
|
57 |
} |
|
58 |
|
|
59 |
mapValueOI = (IntObjectInspector) mapOI.getMapValueObjectInspector(); |
|
60 |
|
|
61 |
List names = Arrays.asList("id", "coauthoredPapersCount"); |
|
62 |
List ois = Arrays.asList( |
|
63 |
PrimitiveObjectInspectorFactory.writableStringObjectInspector, |
|
64 |
PrimitiveObjectInspectorFactory.writableIntObjectInspector); |
|
65 |
|
|
66 |
return ObjectInspectorFactory.getStandardListObjectInspector( |
|
67 |
ObjectInspectorFactory.getStandardStructObjectInspector(names, ois)); |
|
68 |
} |
|
69 |
|
|
70 |
@Override |
|
71 |
public Object evaluate(DeferredObject[] arguments) throws HiveException { |
|
72 |
List coauthors = new ArrayList(); |
|
73 |
String key = authorIdOI.getPrimitiveJavaObject(arguments[0].get()); |
|
74 |
Map map = mapOI.getMap(arguments[1].get()); |
|
75 |
if (!map.isEmpty()) { |
|
76 |
for (Object entry : map.entrySet()) { |
|
77 |
Map.Entry mapEntry = (Map.Entry<Object, Object>) entry; |
|
78 |
String id = mapKeyOI.getPrimitiveJavaObject(mapEntry.getKey()); |
|
79 |
if (!id.equals(key)) { |
|
80 |
int count = mapValueOI.get(mapEntry.getValue()); |
|
81 |
Object[] coauthor = new Object[2]; |
|
82 |
coauthor[0] = new Text(id); |
|
83 |
coauthor[1] = new IntWritable(count); |
|
84 |
coauthors.add(coauthor); |
|
85 |
} |
|
86 |
} |
|
87 |
} |
|
88 |
|
|
89 |
return coauthors; |
|
90 |
} |
|
91 |
|
|
92 |
@Override |
|
93 |
public String getDisplayString(String[] strings) { |
|
94 |
return "GenerateCoauthors()"; |
|
95 |
} |
|
96 |
|
|
97 |
} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/author_to_author_statistics.json | ||
---|---|---|
1 |
{"authorId": "id-1", "statistics": {"core": {"allPapers": {"numberOfPapers": 2, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 0.5, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}, "coAuthors": [{"id": "id-3", "coauthoredPapersCount": 1}, {"id": "id-123", "coauthoredPapersCount": 2}, {"id": "id-2", "coauthoredPapersCount": 1}, {"id": "id-800", "coauthoredPapersCount": 1}]}} |
|
2 |
{"authorId": "id-123", "statistics": {"core": {"allPapers": {"numberOfPapers": 2, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 0.5, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}, "coAuthors": [{"id": "id-3", "coauthoredPapersCount": 1}, {"id": "id-2", "coauthoredPapersCount": 1}, {"id": "id-1", "coauthoredPapersCount": 2}, {"id": "id-800", "coauthoredPapersCount": 1}]}} |
|
3 |
{"authorId": "id-2", "statistics": {"core": {"allPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}, "coAuthors": [{"id": "id-123", "coauthoredPapersCount": 1}, {"id": "id-1", "coauthoredPapersCount": 1}, {"id": "id-800", "coauthoredPapersCount": 1}]}} |
|
4 |
{"authorId": "id-3", "statistics": {"core": {"allPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 2.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}}, "coAuthors": [{"id": "id-123", "coauthoredPapersCount": 1}, {"id": "id-1", "coauthoredPapersCount": 1}]}} |
|
5 |
{"authorId": "id-345", "statistics": {"core": {"allPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 2.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 2.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}}, "coAuthors": []}} |
|
6 |
{"authorId": "id-590", "statistics": {"core": {"allPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}, "publishedPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}}, "coAuthors": []}} |
|
7 |
{"authorId": "id-800", "statistics": {"core": {"allPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}, "coAuthors": [{"id": "id-123", "coauthoredPapersCount": 1}, {"id": "id-2", "coauthoredPapersCount": 1}, {"id": "id-1", "coauthoredPapersCount": 1}]}} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/document_to_document_statistics.json | ||
---|---|---|
1 |
{"documentId": "id-1", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "citationsFromPublishedPapers": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}}} |
|
2 |
{"documentId": "id-2", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}} |
|
3 |
{"documentId": "id-3", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}}} |
|
4 |
{"documentId": "id-4", "statistics": {"citationsFromAllPapers": {"numberOfCitations": 3, "numberOfCitationsPerYear": {"2010": 1, "2001": 2}}, "citationsFromPublishedPapers": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2010": 1, "2001": 1}}}} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/project_to_project_statistics.json | ||
---|---|---|
1 |
{"projectId": "1", "statistics": {"allPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}, "publishedPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}}} |
|
2 |
{"projectId": "2", "statistics": {"allPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}} |
|
3 |
{"projectId": "3", "statistics": {"allPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}, "publishedPapers": {"numberOfPapers": 0, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {}}}}} |
|
4 |
{"projectId": "4", "statistics": {"allPapers": {"numberOfPapers": 2, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 0.5, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}} |
|
5 |
{"projectId": "7", "statistics": {"allPapers": {"numberOfPapers": 3, "citationsFromAllPapers": {"basic": {"numberOfCitations": 5, "numberOfCitationsPerYear": {"2010": 1, "2001": 4}}, "averageNumberOfCitationsPerPaper": 1.6666666, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 2, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 3, "numberOfCitationsPerYear": {"2010": 1, "2001": 2}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 2, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 1, "citationsFromAllPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 0, "numberOfCitationsPerYear": {}}, "averageNumberOfCitationsPerPaper": 0.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 0, "100": 0, "500": 0, "50": 0}}}}} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/global_statistics.json | ||
---|---|---|
1 |
{"allPapers": {"numberOfPapers": 4, "citationsFromAllPapers": {"basic": {"numberOfCitations": 7, "numberOfCitationsPerYear": {"2010": 1, "2001": 6}}, "averageNumberOfCitationsPerPaper": 1.75, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 3, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 4, "numberOfCitationsPerYear": {"2010": 1, "2001": 3}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 3, "100": 0, "500": 0, "50": 0}}}, "publishedPapers": {"numberOfPapers": 2, "citationsFromAllPapers": {"basic": {"numberOfCitations": 2, "numberOfCitationsPerYear": {"2001": 2}}, "averageNumberOfCitationsPerPaper": 1.0, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}, "citationsFromPublishedPapers": {"basic": {"numberOfCitations": 1, "numberOfCitationsPerYear": {"2001": 1}}, "averageNumberOfCitationsPerPaper": 0.5, "numberOfPapersCitedAtLeastXTimes": {"250": 0, "10": 0, "1": 1, "100": 0, "500": 0, "50": 0}}}} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/document_authors_citations.json | ||
---|---|---|
1 |
{"projectIds": ["2", "4", "7"], "referencedDocumentsIds": ["id-2", "id-4", "id-3"], "authorIds": ["id-1", "id-2", "id-123", "id-800"], "isPublished": true, "year":"2001", "documentId": "id-1"} |
|
2 |
{"projectIds": ["7", "4"], "referencedDocumentsIds": [], "authorIds": ["id-1", "id-3", "id-123"], "isPublished": false, "year":"2011", "documentId": "id-2"} |
|
3 |
{"projectIds": [], "referencedDocumentsIds": ["id-4"], "authorIds": ["id-345"], "isPublished": true, "year":"2010", "documentId": "id-3"} |
|
4 |
{"projectIds": ["7"], "referencedDocumentsIds": ["id-3", "id-2", "id-4"], "authorIds": [], "isPublished": false, "year":"2001", "documentId": "id-4"} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/person_id.json | ||
---|---|---|
1 |
{"id": "id-1"} |
|
2 |
{"id": "id-2"} |
|
3 |
{"id": "id-3"} |
|
4 |
{"id": "id-345"} |
|
5 |
{"id": "id-800"} |
|
6 |
{"id": "id-590"} |
|
7 |
{"id": "id-123"} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/data/project_id.json | ||
---|---|---|
1 |
{"id":"1"} |
|
2 |
{"id":"2"} |
|
3 |
{"id":"3"} |
|
4 |
{"id":"4"} |
|
5 |
{"id":"7"} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/oozie_app/import.txt | ||
---|---|---|
1 |
## This is a classpath-based import file (this header is required) |
|
2 |
statistics_main classpath eu/dnetlib/iis/statistics/main/oozie_app |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/src/test/resources/eu/dnetlib/iis/statistics/main/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
1 |
<workflow-app xmlns="uri:oozie:workflow:0.2" name="test-statistics_main"> |
|
2 |
<start to="producer"/> |
|
3 |
<action name="producer"> |
|
4 |
<java> |
|
5 |
<job-tracker>${jobTracker}</job-tracker> |
|
6 |
<name-node>${nameNode}</name-node> |
|
7 |
<!-- The data generated by this node is deleted in this section --> |
|
8 |
<prepare> |
|
9 |
<delete path="${nameNode}${workingDir}/producer" /> |
|
10 |
<mkdir path="${nameNode}${workingDir}/producer" /> |
|
11 |
</prepare> |
|
12 |
<configuration> |
|
13 |
<property> |
|
14 |
<name>mapred.job.queue.name</name> |
|
15 |
<value>${queueName}</value> |
|
16 |
</property> |
|
17 |
</configuration> |
|
18 |
<!-- This is a simple wrapper for the Java code --> |
|
19 |
<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> |
|
20 |
<!-- The business Java code that gets to be executed --> |
|
21 |
<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg> |
|
22 |
<!-- Specification of the output ports --> |
|
23 |
<arg>-C{document_authors_citations, |
|
24 |
eu.dnetlib.iis.statistics.schemas.DocumentWithAuthorsAndCitations, |
|
25 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/document_authors_citations.json}</arg> |
|
26 |
<arg>-C{person_id, |
|
27 |
eu.dnetlib.iis.statistics.schemas.PersonId, |
|
28 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/person_id.json}</arg> |
|
29 |
<arg>-C{project_id, |
|
30 |
eu.dnetlib.iis.statistics.schemas.ProjectId, |
|
31 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/project_id.json}</arg> |
|
32 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
33 |
directory has to be specified as well --> |
|
34 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
35 |
<arg>-Odocument_authors_citations=${workingDir}/producer/document_authors_citations</arg> |
|
36 |
<arg>-Operson_id=${workingDir}/producer/person_id</arg> |
|
37 |
<arg>-Oproject_id=${workingDir}/producer/project_id</arg> |
|
38 |
|
|
39 |
</java> |
|
40 |
<ok to="statistics_main"/> |
|
41 |
<error to="fail"/> |
|
42 |
</action> |
|
43 |
<action name="statistics_main"> |
|
44 |
<sub-workflow> |
|
45 |
<app-path>${wf:appPath()}/statistics_main</app-path> |
|
46 |
<configuration> |
|
47 |
<property> |
|
48 |
<name>jobTracker</name> |
|
49 |
<value>${jobTracker}</value> |
|
50 |
</property> |
|
51 |
<property> |
|
52 |
<name>nameNode</name> |
|
53 |
<value>${nameNode}</value> |
|
54 |
</property> |
|
55 |
<property> |
|
56 |
<name>queueName</name> |
|
57 |
<value>${queueName}</value> |
|
58 |
</property> |
|
59 |
<!-- Working directory of the subworkflow --> |
|
60 |
<property> |
|
61 |
<name>workingDir</name> |
|
62 |
<value>${workingDir}/statistics_main/working_dir</value> |
|
63 |
</property> |
|
64 |
<!-- Input ports. --> |
|
65 |
<property> |
|
66 |
<name>input_document_authors_citations</name> |
|
67 |
<value>${workingDir}/producer/document_authors_citations</value> |
|
68 |
</property> |
|
69 |
<property> |
|
70 |
<name>input_person_id</name> |
|
71 |
<value>${workingDir}/producer/person_id</value> |
|
72 |
</property> |
|
73 |
<property> |
|
74 |
<name>input_project_id</name> |
|
75 |
<value>${workingDir}/producer/project_id</value> |
|
76 |
</property> |
|
77 |
<!-- Output port bound to given path --> |
|
78 |
<property> |
|
79 |
<name>output_document_statistics</name> |
|
80 |
<value>${workingDir}/statistics_main/document_statistics</value> |
|
81 |
</property> |
|
82 |
<property> |
|
83 |
<name>output_author_statistics</name> |
|
84 |
<value>${workingDir}/statistics_main/author_statistics</value> |
|
85 |
</property> |
|
86 |
<property> |
|
87 |
<name>output_project_statistics</name> |
|
88 |
<value>${workingDir}/statistics_main/project_statistics</value> |
|
89 |
</property> |
|
90 |
<property> |
|
91 |
<name>output_global_statistics</name> |
|
92 |
<value>${workingDir}/statistics_main/global_statistics</value> |
|
93 |
</property> |
|
94 |
</configuration> |
|
95 |
</sub-workflow> |
|
96 |
<ok to="consumer"/> |
|
97 |
<error to="fail"/> |
|
98 |
</action> |
|
99 |
<action name="consumer"> |
|
100 |
<java> |
|
101 |
<job-tracker>${jobTracker}</job-tracker> |
|
102 |
<name-node>${nameNode}</name-node> |
|
103 |
<!-- The data generated by this node is deleted in this section --> |
|
104 |
<prepare> |
|
105 |
<delete path="${nameNode}${workingDir}/consumer" /> |
|
106 |
<mkdir path="${nameNode}${workingDir}/consumer" /> |
|
107 |
</prepare> |
|
108 |
<configuration> |
|
109 |
<property> |
|
110 |
<name>mapred.job.queue.name</name> |
|
111 |
<value>${queueName}</value> |
|
112 |
</property> |
|
113 |
</configuration> |
|
114 |
<!-- This is a simple wrapper for the Java code --> |
|
115 |
<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> |
|
116 |
<!-- The business Java code that gets to be executed --> |
|
117 |
<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg> |
|
118 |
<!-- Specification of the input ports --> |
|
119 |
<arg>-C{document_statistics, |
|
120 |
eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics, |
|
121 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/document_to_document_statistics.json}</arg> |
|
122 |
<arg>-C{author_statistics, |
|
123 |
eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics, |
|
124 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/author_to_author_statistics.json}</arg> |
|
125 |
<arg>-C{project_statistics, |
|
126 |
eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics, |
|
127 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/project_to_project_statistics.json}</arg> |
|
128 |
<arg>-C{global_statistics, |
|
129 |
eu.dnetlib.iis.statistics.schemas.CommonCoreStatistics, |
|
130 |
eu/dnetlib/iis/statistics/main/sampledataproducer/data/global_statistics.json}</arg> |
|
131 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
132 |
directory has to be specified as well --> |
|
133 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
134 |
<arg>-Idocument_statistics=${workingDir}/statistics_main/document_statistics</arg> |
|
135 |
<arg>-Iauthor_statistics=${workingDir}/statistics_main/author_statistics</arg> |
|
136 |
<arg>-Iproject_statistics=${workingDir}/statistics_main/project_statistics</arg> |
|
137 |
<arg>-Iglobal_statistics=${workingDir}/statistics_main/global_statistics</arg> |
|
138 |
</java> |
|
139 |
<ok to="end" /> |
|
140 |
<error to="fail" /> |
|
141 |
</action> |
|
142 |
<kill name="fail"> |
|
143 |
<message>Unfortunately, the workflow failed -- error message: |
|
144 |
[${wf:errorMessage(wf:lastErrorNode())}]</message> |
|
145 |
</kill> |
|
146 |
<end name="end"/> |
|
147 |
</workflow-app> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/deploy.info | ||
---|---|---|
1 |
{ |
|
2 |
"type_source": "SVN", |
|
3 |
"goal": "package -U -T 4C source:jar", |
|
4 |
"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/icm-iis-statistics/trunk/", |
|
5 |
"deploy_repository": "dnet4-snapshots", |
|
6 |
"version": "4", |
|
7 |
"mail": "m.horst@icm.edu.pl,d.tkaczyk@icm.edu.pl", |
|
8 |
"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", |
|
9 |
"name": "icm-iis-statistics" |
|
10 |
} |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/test-custom-log4j.properties | ||
---|---|---|
1 |
# |
|
2 |
# Licensed to the Apache Software Foundation (ASF) under one |
|
3 |
# or more contributor license agreements. See the NOTICE file |
|
4 |
# distributed with this work for additional information |
|
5 |
# regarding copyright ownership. The ASF licenses this file |
|
6 |
# to you under the Apache License, Version 2.0 (the |
|
7 |
# "License"); you may not use this file except in compliance |
|
8 |
# with the License. You may obtain a copy of the License at |
|
9 |
# |
|
10 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
11 |
# |
|
12 |
# Unless required by applicable law or agreed to in writing, software |
|
13 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
14 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
15 |
# See the License for the specific language governing permissions and |
|
16 |
# limitations under the License. |
|
17 |
# |
|
18 |
|
|
19 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
20 |
# |
|
21 |
# Unless required by applicable law or agreed to in writing, software |
|
22 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
23 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
24 |
# See the License for the specific language governing permissions and |
|
25 |
# limitations under the License. See accompanying LICENSE file. |
|
26 |
|
|
27 |
# |
|
28 |
|
|
29 |
log4j.appender.oozie=org.apache.log4j.ConsoleAppender |
|
30 |
log4j.appender.oozie.Target=System.out |
|
31 |
log4j.appender.oozie.layout=org.apache.log4j.PatternLayout |
|
32 |
log4j.appender.oozie.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n |
|
33 |
|
|
34 |
log4j.appender.null=org.apache.log4j.varia.NullAppender |
|
35 |
|
|
36 |
log4j.logger.org.apache=INFO, oozie |
|
37 |
log4j.logger.org.mortbay=WARN, oozie |
|
38 |
log4j.logger.org.hsqldb=WARN, oozie |
|
39 |
|
|
40 |
log4j.logger.opslog=NONE, null |
|
41 |
log4j.logger.applog=NONE, null |
|
42 |
log4j.logger.instrument=NONE, null |
|
43 |
|
|
44 |
log4j.logger.a=NONE, null |
|
45 |
|
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/hsqldb-oozie-site.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
|
3 |
<!-- |
|
4 |
Copyright (c) 2010 Yahoo! Inc. All rights reserved. |
|
5 |
Licensed under the Apache License, Version 2.0 (the "License"); |
|
6 |
you may not use this file except in compliance with the License. |
|
7 |
You may obtain a copy of the License at |
|
8 |
|
|
9 |
http://www.apache.org/licenses/LICENSE-2.0 |
|
10 |
|
|
11 |
Unless required by applicable law or agreed to in writing, software |
|
12 |
distributed under the License is distributed on an "AS IS" BASIS, |
|
13 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14 |
See the License for the specific language governing permissions and |
|
15 |
limitations under the License. See accompanying LICENSE file. |
|
16 |
--> |
|
17 |
<configuration> |
|
18 |
<property> |
|
19 |
<name>oozie.service.JPAService.jdbc.driver</name> |
|
20 |
<value>org.hsqldb.jdbcDriver</value> |
|
21 |
</property> |
|
22 |
<property> |
|
23 |
<name>oozie.service.JPAService.jdbc.url</name> |
|
24 |
<value>jdbc:hsqldb:mem:oozie-db;create=true</value> |
|
25 |
</property> |
|
26 |
</configuration> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/mysql-oozie-site.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<!-- |
|
3 |
Licensed to the Apache Software Foundation (ASF) under one |
|
4 |
or more contributor license agreements. See the NOTICE file |
|
5 |
distributed with this work for additional information |
|
6 |
regarding copyright ownership. The ASF licenses this file |
|
7 |
to you under the Apache License, Version 2.0 (the |
|
8 |
"License"); you may not use this file except in compliance |
|
9 |
with the License. You may obtain a copy of the License at |
|
10 |
|
|
11 |
http://www.apache.org/licenses/LICENSE-2.0 |
|
12 |
|
|
13 |
Unless required by applicable law or agreed to in writing, software |
|
14 |
distributed under the License is distributed on an "AS IS" BASIS, |
|
15 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
16 |
See the License for the specific language governing permissions and |
|
17 |
limitations under the License. |
|
18 |
--> |
|
19 |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
|
20 |
<configuration> |
|
21 |
<property> |
|
22 |
<name>oozie.service.JPAService.jdbc.driver</name> |
|
23 |
<value>com.mysql.jdbc.Driver</value> |
|
24 |
<description>JDBC driver class.</description> |
|
25 |
</property> |
|
26 |
<property> |
|
27 |
<name>oozie.test.db.port</name> |
|
28 |
<value>3306</value> |
|
29 |
</property> |
|
30 |
<property> |
|
31 |
<name>oozie.service.JPAService.jdbc.url</name> |
|
32 |
<value>jdbc:mysql://${oozie.test.db.host}:${oozie.test.db.port}/oozie</value> |
|
33 |
<description>JDBC URL.</description> |
|
34 |
</property> |
|
35 |
<property> |
|
36 |
<name>oozie.service.JPAService.jdbc.username</name> |
|
37 |
<value>oozie</value> |
|
38 |
<description>DB user name.</description> |
|
39 |
</property> |
|
40 |
<property> |
|
41 |
<name>oozie.service.JPAService.jdbc.password</name> |
|
42 |
<value>oozie</value> |
|
43 |
<description> |
|
44 |
DB user password. IMPORTANT: if password is empty leave a 1 space string, the service trims the |
|
45 |
value, if empty Configuration assumes it is NULL. |
|
46 |
</description> |
|
47 |
</property> |
|
48 |
</configuration> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/oracle-oozie-site.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
|
3 |
<!-- |
|
4 |
Copyright (c) 2010 Yahoo! Inc. All rights reserved. |
|
5 |
Licensed under the Apache License, Version 2.0 (the "License"); |
|
6 |
you may not use this file except in compliance with the License. |
|
7 |
You may obtain a copy of the License at |
|
8 |
|
|
9 |
http://www.apache.org/licenses/LICENSE-2.0 |
|
10 |
|
|
11 |
Unless required by applicable law or agreed to in writing, software |
|
12 |
distributed under the License is distributed on an "AS IS" BASIS, |
|
13 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14 |
See the License for the specific language governing permissions and |
|
15 |
limitations under the License. See accompanying LICENSE file. |
|
16 |
--> |
|
17 |
<configuration> |
|
18 |
<property> |
|
19 |
<name>oozie.service.JPAService.jdbc.driver</name> |
|
20 |
<value>oracle.jdbc.driver.OracleDriver</value> |
|
21 |
</property> |
|
22 |
<property> |
|
23 |
<name>oozie.test.db.port</name> |
|
24 |
<value>1521</value> |
|
25 |
</property> |
|
26 |
<property> |
|
27 |
<name>oozie.test.db.name</name> |
|
28 |
<value>xe</value> |
|
29 |
</property> |
|
30 |
<property> |
|
31 |
<name>oozie.service.JPAService.jdbc.url</name> |
|
32 |
<value>jdbc:oracle:thin:@//${oozie.test.db.host}:${oozie.test.db.port}/${oozie.test.db.name}</value> |
|
33 |
</property> |
|
34 |
<property> |
|
35 |
<name>oozie.service.JPAService.jdbc.username</name> |
|
36 |
<value>oozie</value> |
|
37 |
</property> |
|
38 |
<property> |
|
39 |
<name>oozie.service.JPAService.jdbc.password</name> |
|
40 |
<value>oozie</value> |
|
41 |
</property> |
|
42 |
</configuration> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/postgres-oozie-site.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
|
3 |
<!-- |
|
4 |
Copyright (c) 2010 Yahoo! Inc. All rights reserved. |
|
5 |
Licensed under the Apache License, Version 2.0 (the "License"); |
|
6 |
you may not use this file except in compliance with the License. |
|
7 |
You may obtain a copy of the License at |
|
8 |
|
|
9 |
http://www.apache.org/licenses/LICENSE-2.0 |
|
10 |
|
|
11 |
Unless required by applicable law or agreed to in writing, software |
|
12 |
distributed under the License is distributed on an "AS IS" BASIS, |
|
13 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14 |
See the License for the specific language governing permissions and |
|
15 |
limitations under the License. See accompanying LICENSE file. |
|
16 |
--> |
|
17 |
<configuration> |
|
18 |
<property> |
|
19 |
<name>oozie.service.JPAService.jdbc.driver</name> |
|
20 |
<value>org.postgresql.Driver</value> |
|
21 |
</property> |
|
22 |
<property> |
|
23 |
<name>oozie.test.db.port</name> |
|
24 |
<value>5432</value> |
|
25 |
</property> |
|
26 |
<property> |
|
27 |
<name>oozie.test.db.name</name> |
|
28 |
<value>oozie</value> |
|
29 |
</property> |
|
30 |
<property> |
|
31 |
<name>oozie.service.JPAService.jdbc.url</name> |
|
32 |
<value>jdbc:postgresql://${oozie.test.db.host}:${oozie.test.db.port}/${oozie.test.db.name}</value> |
|
33 |
</property> |
|
34 |
<property> |
|
35 |
<name>oozie.service.JPAService.jdbc.username</name> |
|
36 |
<value>oozie</value> |
|
37 |
</property> |
|
38 |
<property> |
|
39 |
<name>oozie.service.JPAService.jdbc.password</name> |
|
40 |
<value>oozie</value> |
|
41 |
</property> |
|
42 |
</configuration> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/test-oozie-log4j.properties | ||
---|---|---|
1 |
# |
|
2 |
# Licensed to the Apache Software Foundation (ASF) under one |
|
3 |
# or more contributor license agreements. See the NOTICE file |
|
4 |
# distributed with this work for additional information |
|
5 |
# regarding copyright ownership. The ASF licenses this file |
|
6 |
# to you under the Apache License, Version 2.0 (the |
|
7 |
# "License"); you may not use this file except in compliance |
|
8 |
# with the License. You may obtain a copy of the License at |
|
9 |
# |
|
10 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
11 |
# |
|
12 |
# Unless required by applicable law or agreed to in writing, software |
|
13 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
14 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
15 |
# See the License for the specific language governing permissions and |
|
16 |
# limitations under the License. |
|
17 |
# |
|
18 |
|
|
19 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
20 |
# |
|
21 |
# Unless required by applicable law or agreed to in writing, software |
|
22 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
23 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
24 |
# See the License for the specific language governing permissions and |
|
25 |
# limitations under the License. See accompanying LICENSE file. |
|
26 |
|
|
27 |
# |
|
28 |
|
|
29 |
log4j.appender.oozie=org.apache.log4j.ConsoleAppender |
|
30 |
log4j.appender.oozie.Target=System.out |
|
31 |
log4j.appender.oozie.layout=org.apache.log4j.PatternLayout |
|
32 |
log4j.appender.oozie.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n |
|
33 |
|
|
34 |
log4j.appender.null=org.apache.log4j.varia.NullAppender |
|
35 |
|
|
36 |
log4j.logger.org.apache=INFO, oozie |
|
37 |
log4j.logger.org.mortbay=WARN, oozie |
|
38 |
log4j.logger.org.hsqldb=WARN, oozie |
|
39 |
|
|
40 |
log4j.logger.opslog=NONE, null |
|
41 |
log4j.logger.applog=NONE, null |
|
42 |
log4j.logger.instrument=NONE, null |
|
43 |
|
|
44 |
log4j.logger.a=ALL, null |
|
45 |
|
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/src/test/resources/hadoop-config.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
|
3 |
<!-- |
|
4 |
Licensed to the Apache Software Foundation (ASF) under one |
|
5 |
or more contributor license agreements. See the NOTICE file |
|
6 |
distributed with this work for additional information |
|
7 |
regarding copyright ownership. The ASF licenses this file |
|
8 |
to you under the Apache License, Version 2.0 (the |
|
9 |
"License"); you may not use this file except in compliance |
|
10 |
with the License. You may obtain a copy of the License at |
|
11 |
|
|
12 |
http://www.apache.org/licenses/LICENSE-2.0 |
|
13 |
|
|
14 |
Unless required by applicable law or agreed to in writing, software |
|
15 |
distributed under the License is distributed on an "AS IS" BASIS, |
|
16 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
17 |
See the License for the specific language governing permissions and |
|
18 |
limitations under the License. |
|
19 |
--> |
|
20 |
<configuration> |
|
21 |
|
|
22 |
<property> |
|
23 |
<name>mapreduce.jobtracker.kerberos.principal</name> |
|
24 |
<value>mapred/_HOST@LOCALREALM</value> |
|
25 |
</property> |
|
26 |
|
|
27 |
<property> |
|
28 |
<name>dfs.namenode.kerberos.principal</name> |
|
29 |
<value>hdfs/_HOST@LOCALREALM</value> |
|
30 |
</property> |
|
31 |
|
|
32 |
<property> |
|
33 |
<name>mapreduce.framework.name</name> |
|
34 |
<value>yarn</value> |
|
35 |
</property> |
|
36 |
|
|
37 |
</configuration> |
modules/icm-iis-statistics/tags/icm-iis-statistics-1.0.0/core/README.md | ||
---|---|---|
1 |
This directory and its subdirectories and files are here as a hack to make the Oozie unit tests work. |
|
2 |
|
|
3 |
Details |
|
4 |
------- |
|
5 |
Oozie tests assume that they are placed inside the directory tree of the Oozie source code -- see the source code of the `XTestCase` class, which is an ancestor of the `MiniOozieTestCase` class, which, in turn, should be extended by your test case class. |
|
6 |
|
|
7 |
How to get the source code of the `XTestCase` class: |
|
8 |
|
|
9 |
- download the source code of Ubuntu's `oozie` package prepared by Cloudera (`apt-get source oozie`). It is version 3.1.3+155 of this package. |
|
10 |
- open the file `oozie-3.1.3+155/src/core/src/test/java/org/apache/oozie/test/XTestCase.java` and look at lines 93-105.
Also available in: Unified diff
[maven-release-plugin] copy for tag icm-iis-statistics-1.0.0