Revision 41818
Added by Eri Katsari over 8 years ago
modules/dnet-openaire-stats/branches/extended/src/test/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/GeneralTests.java | ||
---|---|---|
106 | 106 |
} |
107 | 107 |
|
108 | 108 |
|
109 |
|
|
110 | 109 |
@Test |
111 | 110 |
public void testPath() { |
112 | 111 |
|
... | ... | |
120 | 119 |
@Test |
121 | 120 |
public void test() { |
122 | 121 |
|
123 |
Integer s = new Integer(5); |
|
124 |
// s="lala"; |
|
125 |
System.out.println(s); |
|
126 |
test2(s); |
|
127 |
System.out.println("s outside " + s); |
|
122 |
String value = "d'Analisi,"; |
|
123 |
value = value.replaceAll("'", " "); |
|
124 |
value = value.replaceAll("\\r\\n|\\r|\\n", " "); |
|
125 |
value = value.replaceAll("\\s+", " "); |
|
128 | 126 |
|
127 |
value = value.replaceAll("(\\r|\\n)", " "); |
|
128 |
value = value.replaceAll("\\t", " "); |
|
129 |
value = value.replace("\n", " "); |
|
130 |
value = value.replaceAll("[\"\\r\\\\;]", ""); |
|
131 |
|
|
132 |
value.replace(">", " "); |
|
133 |
value.replace("<", " "); |
|
134 |
value.replace(",", " "); |
|
135 |
value.replace("\"", " "); |
|
136 |
value.replace("'", " "); |
|
137 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/@+=]+", " "); |
|
138 |
System.out.println(value); |
|
139 |
|
|
129 | 140 |
} |
130 | 141 |
|
131 | 142 |
public void test2(Integer s) { |
132 | 143 |
Integer a = new Integer(10); |
133 | 144 |
s = 10; |
134 | 145 |
|
146 |
|
|
135 | 147 |
System.out.println("s inside " + s); |
136 | 148 |
|
137 | 149 |
|
modules/dnet-openaire-stats/branches/extended/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/daos/UsageStatsDAO.java | ||
---|---|---|
6 | 6 |
import org.apache.hadoop.fs.FileSystem; |
7 | 7 |
import org.apache.hadoop.fs.Path; |
8 | 8 |
import org.apache.log4j.Logger; |
9 |
|
|
9 | 10 |
import org.springframework.jdbc.datasource.DriverManagerDataSource; |
10 | 11 |
|
12 |
import javax.sql.DataSource; |
|
11 | 13 |
import java.io.File; |
12 |
import java.sql.Connection; |
|
13 |
import java.sql.ResultSet; |
|
14 |
import java.sql.ResultSetMetaData; |
|
15 |
import java.sql.Statement; |
|
14 |
import java.sql.*; |
|
16 | 15 |
|
17 | 16 |
/** |
18 | 17 |
* @author eri |
... | ... | |
26 | 25 |
private Logger log = Logger.getLogger(this.getClass()); |
27 | 26 |
private String delim; |
28 | 27 |
private String outputPath; |
28 |
private String dbUrl; |
|
29 | 29 |
|
30 | 30 |
public UsageStatsDAO(String dbUrl, String dbDriver, String delim, String outputPath) throws Exception { |
31 |
usageStatsDatasource = new DriverManagerDataSource(); |
|
31 |
|
|
32 |
usageStatsDatasource = new DriverManagerDataSource(dbUrl); |
|
32 | 33 |
usageStatsDatasource.setDriverClassName(dbDriver); |
33 |
usageStatsDatasource.setUrl(dbUrl); |
|
34 |
|
|
34 | 35 |
this.delim = delim; |
35 | 36 |
this.outputPath = outputPath; |
37 |
this.dbUrl = dbUrl; |
|
36 | 38 |
|
37 | 39 |
log.info("Usage Stats DB url" + dbUrl); |
38 | 40 |
|
39 |
String dbName = dbUrl.substring(dbUrl.indexOf("/"), dbUrl.length()); |
|
40 |
File file = new File(dbName); |
|
41 |
|
|
42 |
if (file.exists()) //here's how to check |
|
43 |
{ |
|
44 |
log.info("Database " + dbName + "exists"); |
|
45 |
|
|
46 |
} else { |
|
47 |
log.error("Database " + dbName + "not found"); |
|
48 |
throw new Exception("Database not found"); |
|
49 |
} |
|
50 | 41 |
} |
51 | 42 |
|
52 | 43 |
public void getUsageStatistics(String table) throws Exception { |
53 | 44 |
|
54 | 45 |
log.info("Getting Usage Statistics for " + table); |
55 | 46 |
// String q = "select * from " + table + "Stats"; |
56 |
String q = "SELECT * from datasourceStats"; |
|
47 |
String q = "SELECT * from datasourceStats; ";
|
|
57 | 48 |
Connection con = null; |
58 | 49 |
|
50 |
//con = DriverManager.getConnection(dbUrl); |
|
51 |
|
|
59 | 52 |
con = usageStatsDatasource.getConnection(); |
60 | 53 |
|
61 | 54 |
if (con == null) { |
62 |
log.error("Database not found");
|
|
63 |
throw new Exception("Database not found");
|
|
55 |
log.error("cannot open connection to Database ");
|
|
56 |
throw new Exception("cannot open connection to Database");
|
|
64 | 57 |
} |
65 | 58 |
|
66 | 59 |
Statement st = con.createStatement(); |
... | ... | |
132 | 125 |
} |
133 | 126 |
|
134 | 127 |
|
135 |
public DriverManagerDataSource getUsageStatsDatasource() { |
|
136 |
return usageStatsDatasource; |
|
137 |
} |
|
138 |
|
|
139 |
public void setUsageStatsDatasource(DriverManagerDataSource usageStatsDatasource) { |
|
140 |
this.usageStatsDatasource = usageStatsDatasource; |
|
141 |
} |
|
142 |
|
|
143 | 128 |
public String getDelim() { |
144 | 129 |
return delim; |
145 | 130 |
} |
modules/dnet-openaire-stats/branches/extended/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/drivers/SqoopImportDriver.java | ||
---|---|---|
63 | 63 |
"--target-dir", outputPath, |
64 | 64 |
"--verbose", |
65 | 65 |
"--input-fields-terminated-by", delim, |
66 |
"--driver", driver, |
|
67 |
"--mapreduce-job-name", "Sqoop Usage Stats Import Job for " + table.getKey() |
|
66 |
"--driver", driver |
|
67 |
// "--relaxed-isolation", |
|
68 |
//"--mapreduce-job-name", "Sqoop Usage Stats Import Job for " + table.getKey() |
|
68 | 69 |
}; |
69 | 70 |
|
70 | 71 |
int ret = Sqoop.runTool(str); |
modules/dnet-openaire-stats/branches/full_export/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/StaticSerializer.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils; |
|
2 |
|
|
3 |
import com.google.common.collect.ArrayListMultimap; |
|
4 |
import com.google.common.collect.Multimap; |
|
5 |
import eu.dnetlib.data.mapreduce.util.LicenseComparator; |
|
6 |
import eu.dnetlib.data.proto.DatasourceProtos.Datasource; |
|
7 |
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata; |
|
8 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
|
9 |
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; |
|
10 |
import eu.dnetlib.data.proto.FieldTypeProtos.StringField; |
|
11 |
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; |
|
12 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
13 |
import eu.dnetlib.data.proto.OafProtos.OafEntity; |
|
14 |
import eu.dnetlib.data.proto.OafProtos.OafRel; |
|
15 |
import eu.dnetlib.data.proto.OrganizationProtos.Organization; |
|
16 |
import eu.dnetlib.data.proto.PersonProtos; |
|
17 |
import eu.dnetlib.data.proto.ProjectProtos.Project; |
|
18 |
import eu.dnetlib.data.proto.RelTypeProtos.RelType; |
|
19 |
import eu.dnetlib.data.proto.ResultProtos.Result; |
|
20 |
import eu.dnetlib.data.proto.ResultProtos.Result.Instance; |
|
21 |
import org.apache.log4j.Logger; |
|
22 |
import org.jsoup.Jsoup; |
|
23 |
|
|
24 |
import java.text.SimpleDateFormat; |
|
25 |
import java.util.Date; |
|
26 |
import java.util.List; |
|
27 |
|
|
28 |
/** |
|
29 |
* @author eri Simple serializer that parses input Oaf Protos and prepares them |
|
30 |
* for sqoop |
|
31 |
*/ |
|
32 |
public class StaticSerializer { |
|
33 |
|
|
34 |
public static String serialize(Oaf oaf, String DELIM, String ENCLOSING) { |
|
35 |
|
|
36 |
switch (oaf.getKind()) { |
|
37 |
case entity: |
|
38 |
OafEntity valueEntity = oaf.getEntity(); |
|
39 |
|
|
40 |
switch (valueEntity.getType()) { |
|
41 |
case datasource: |
|
42 |
|
|
43 |
return buildDatasource(oaf, DELIM, ENCLOSING); |
|
44 |
|
|
45 |
case organization: |
|
46 |
|
|
47 |
return buildOrganization(oaf, DELIM, ENCLOSING); |
|
48 |
|
|
49 |
case project: |
|
50 |
|
|
51 |
return buildProject(oaf, DELIM, ENCLOSING); |
|
52 |
case result: |
|
53 |
|
|
54 |
return buildResult(oaf, DELIM, ENCLOSING); |
|
55 |
case person: |
|
56 |
return buildPerson(oaf, DELIM, ENCLOSING); |
|
57 |
default: |
|
58 |
break; |
|
59 |
} |
|
60 |
break; |
|
61 |
case relation: |
|
62 |
OafRel valueRel = oaf.getRel(); |
|
63 |
return buildRel(valueRel, DELIM, ENCLOSING); |
|
64 |
|
|
65 |
} |
|
66 |
|
|
67 |
return null; |
|
68 |
|
|
69 |
} |
|
70 |
|
|
71 |
public static String serialize(OafRel oaf, String DELIM, String ENCLOSING) { |
|
72 |
|
|
73 |
switch (oaf.getRelType()) { |
|
74 |
case resultProject: |
|
75 |
return getResultProject(oaf, DELIM, ENCLOSING); |
|
76 |
default: |
|
77 |
return buildRel(oaf, DELIM, ENCLOSING); |
|
78 |
} |
|
79 |
} |
|
80 |
|
|
81 |
private static String buildRel(OafRel Rel, String DELIM, String ENCLOSING) { |
|
82 |
return getStringField(Rel.getTarget(), DELIM, ENCLOSING); |
|
83 |
} |
|
84 |
|
|
85 |
public static Multimap<String, String> extractRelations(Oaf oaf, String DELIM, String ENCLOSING) { |
|
86 |
OafEntity valueEntity = oaf.getEntity(); |
|
87 |
Multimap<String, String> relations = ArrayListMultimap.create(); |
|
88 |
|
|
89 |
getOriginalId(valueEntity, relations, DELIM, ENCLOSING); |
|
90 |
|
|
91 |
switch (valueEntity.getType()) { |
|
92 |
case datasource: |
|
93 |
getDatasourceLanguages(valueEntity, relations, DELIM, ENCLOSING); |
|
94 |
case result: |
|
95 |
getResultTopics(valueEntity, relations, DELIM, ENCLOSING); |
|
96 |
getResultLanguages(valueEntity, relations, DELIM, ENCLOSING); |
|
97 |
getResultClassifications(valueEntity, relations, DELIM, ENCLOSING); |
|
98 |
getResultDatasources(valueEntity, relations, DELIM, ENCLOSING); |
|
99 |
getResultConcepts(valueEntity, relations, DELIM, ENCLOSING); |
|
100 |
getResultDois(valueEntity, relations, DELIM, ENCLOSING); |
|
101 |
getResultCitations(valueEntity, relations, DELIM, ENCLOSING); |
|
102 |
getResultDescriptions(valueEntity, relations, DELIM, ENCLOSING); |
|
103 |
return relations; |
|
104 |
|
|
105 |
case project: |
|
106 |
getProjectKeywords(valueEntity, relations, DELIM, ENCLOSING); |
|
107 |
getProjectSubjects(valueEntity, relations, DELIM, ENCLOSING); |
|
108 |
|
|
109 |
default: |
|
110 |
return null; |
|
111 |
} |
|
112 |
|
|
113 |
} |
|
114 |
|
|
115 |
|
|
116 |
private static void getOriginalId(OafEntity oafEntity, Multimap<String, String> relations, String DELIM, String ENCLOSING) { |
|
117 |
|
|
118 |
String relName = oafEntity.getType().toString().toLowerCase() + "Oid"; |
|
119 |
for (String oid : oafEntity.getOriginalIdList()) { |
|
120 |
relations.put(relName, ENCLOSING + oid.replace(DELIM, " ").replace("\n", "") + ENCLOSING + DELIM); |
|
121 |
} |
|
122 |
|
|
123 |
} |
|
124 |
|
|
125 |
private static void getProjectKeywords(OafEntity oafEntity, Multimap<String, String> relations, String DELIM, String ENCLOSING) { |
|
126 |
relations.put("projectKeyword", getStringField(oafEntity.getProject().getMetadata().getKeywords().getValue(), DELIM, ENCLOSING)); |
|
127 |
|
|
128 |
} |
|
129 |
|
|
130 |
private static void getProjectSubjects(OafEntity oafEntity, Multimap<String, String> relations, String DELIM, String ENCLOSING) { |
|
131 |
for (StructuredProperty subj : oafEntity.getProject().getMetadata().getSubjectsList()) { |
|
132 |
relations.put("projectSubject", getStringField(subj.getValue(), DELIM, ENCLOSING)); |
|
133 |
} |
|
134 |
} |
|
135 |
|
|
136 |
private static String getResultProject(OafRel oaf, String DELIM, String ENCLOSING) { |
|
137 |
String buff = new String(); |
|
138 |
String result = oaf.getTarget(); |
|
139 |
|
|
140 |
buff += getStringField(result, DELIM, ENCLOSING); |
|
141 |
// TODO is declared as int!!! |
|
142 |
long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate()); |
|
143 |
if (diff < 0) { |
|
144 |
diff = 0; |
|
145 |
} |
|
146 |
buff += getNumericField(String.valueOf(diff), DELIM, ENCLOSING); |
|
147 |
return buff; |
|
148 |
} |
|
149 |
|
|
150 |
|
|
151 |
private static void getDatasourceLanguages(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
152 |
Datasource d = valueEntity.getDatasource(); |
|
153 |
Metadata metadata = d.getMetadata(); |
|
154 |
|
|
155 |
for (StringField lang : metadata.getOdlanguagesList()) { |
|
156 |
rels.put("datasourceLanguage", getStringField(lang.getValue(), DELIM, ENCLOSING)); |
|
157 |
} |
|
158 |
} |
|
159 |
|
|
160 |
private static void getResultLanguages(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
161 |
|
|
162 |
Result d = valueEntity.getResult(); |
|
163 |
Result.Metadata metadata = d.getMetadata(); |
|
164 |
if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) { |
|
165 |
rels.put("resultLanguage", getStringField(metadata.getLanguage().getClassname(), DELIM, ENCLOSING)); |
|
166 |
} |
|
167 |
|
|
168 |
} |
|
169 |
|
|
170 |
private static void getResultDois(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
171 |
|
|
172 |
for (StructuredProperty pid : valueEntity.getPidList()) { |
|
173 |
|
|
174 |
rels.put("resultPid", |
|
175 |
getStringField(pid.getQualifier().getClassname(), DELIM, ENCLOSING) + getStringField(pid.getValue(), DELIM, ENCLOSING)); |
|
176 |
} |
|
177 |
} |
|
178 |
|
|
179 |
private static void getResultClassifications(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
180 |
|
|
181 |
Result result = valueEntity.getResult(); |
|
182 |
|
|
183 |
for (Instance instance : (result.getInstanceList())) { |
|
184 |
String classification = instance.getInstancetype().getClassname(); |
|
185 |
if (classification != null && !classification.isEmpty()) { |
|
186 |
rels.put("resultClassification", getStringField(instance.getInstancetype().getClassname(), DELIM, ENCLOSING)); |
|
187 |
// TODO HERE KEEP ONLY ONE CLASSIFICATIONS PER RESULT |
|
188 |
break; |
|
189 |
} |
|
190 |
} |
|
191 |
} |
|
192 |
|
|
193 |
private static void getResultDescriptions(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
194 |
Result result = valueEntity.getResult(); |
|
195 |
//description |
|
196 |
for (StringField s : result.getMetadata().getDescriptionList()) { |
|
197 |
|
|
198 |
rels.put("resultDescription", getStringField(Jsoup.parse(s.getValue()).text(), DELIM, ENCLOSING)); |
|
199 |
} |
|
200 |
} |
|
201 |
|
|
202 |
private static void getResultConcepts(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
203 |
|
|
204 |
Result result = valueEntity.getResult(); |
|
205 |
|
|
206 |
for (Result.Context context : result.getMetadata().getContextList()) { |
|
207 |
|
|
208 |
rels.put("resultConcept", getStringField(context.getId(), DELIM, ENCLOSING)); |
|
209 |
} |
|
210 |
} |
|
211 |
|
|
212 |
|
|
213 |
private static void getResultDatasources(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
214 |
Result result = valueEntity.getResult(); |
|
215 |
|
|
216 |
//TODO hosted by |
|
217 |
for (Instance instance : (result.getInstanceList())) { |
|
218 |
String hostedBy = instance.getHostedby().getKey(); |
|
219 |
if (hostedBy != null && !hostedBy.isEmpty()) { |
|
220 |
rels.put("resultDatasource", getStringField(hostedBy, DELIM, ENCLOSING)); |
|
221 |
} |
|
222 |
} |
|
223 |
|
|
224 |
//TODO collected froms |
|
225 |
for (FieldTypeProtos.KeyValue collectedFromValue : (valueEntity.getCollectedfromList())) { |
|
226 |
|
|
227 |
String collectedFrom = collectedFromValue.getKey(); |
|
228 |
if (collectedFrom != null && !collectedFrom.isEmpty()) { |
|
229 |
rels.put("resultDatasource", getStringField(collectedFrom, DELIM, ENCLOSING)); |
|
230 |
} |
|
231 |
} |
|
232 |
} |
|
233 |
|
|
234 |
private static void getResultTopics(OafEntity valueEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
235 |
|
|
236 |
Result d = valueEntity.getResult(); |
|
237 |
Result.Metadata metadata = d.getMetadata(); |
|
238 |
|
|
239 |
List<StructuredProperty> Topics = metadata.getSubjectList(); |
|
240 |
|
|
241 |
for (StructuredProperty topic : Topics) { |
|
242 |
// TODO result topics |
|
243 |
rels.put("resultTopic", getStringField(topic.getValue(), DELIM, ENCLOSING)); |
|
244 |
} |
|
245 |
} |
|
246 |
|
|
247 |
|
|
248 |
private static void getResultCitations(OafEntity oafEntity, Multimap<String, String> rels, String DELIM, String ENCLOSING) { |
|
249 |
for (FieldTypeProtos.ExtraInfo extraInfo : oafEntity.getExtraInfoList()) { |
|
250 |
if (extraInfo.getName().equals("result citations")) { |
|
251 |
rels.put("resultCitation", getStringField(extraInfo.getTrust(), DELIM, ENCLOSING) + |
|
252 |
getStringField(extraInfo.getProvenance(), DELIM, ENCLOSING) + getStringField(extraInfo.getValue(), DELIM, ENCLOSING)); |
|
253 |
} |
|
254 |
|
|
255 |
} |
|
256 |
} |
|
257 |
|
|
258 |
private static String buildDatasource(Oaf oaf, String DELIM, String ENCLOSING) { |
|
259 |
OafEntity data = oaf.getEntity(); |
|
260 |
|
|
261 |
String buff = new String(); |
|
262 |
|
|
263 |
Datasource d = data.getDatasource(); |
|
264 |
Metadata metadata = d.getMetadata(); |
|
265 |
|
|
266 |
|
|
267 |
// name |
|
268 |
if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) { |
|
269 |
buff += getStringField("Unknown Repository", DELIM, ENCLOSING); |
|
270 |
} else { |
|
271 |
buff += getStringField(metadata.getOfficialname().getValue(), DELIM, ENCLOSING); |
|
272 |
} |
|
273 |
// type |
|
274 |
|
|
275 |
if (metadata.hasDatasourcetype()) |
|
276 |
|
|
277 |
{ |
|
278 |
buff += getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", ""), DELIM, ENCLOSING); |
|
279 |
|
|
280 |
} else { |
|
281 |
buff += getStringField(null, DELIM, ENCLOSING); |
|
282 |
; |
|
283 |
} |
|
284 |
|
|
285 |
// compatibility, |
|
286 |
buff += getStringField(metadata.getOpenairecompatibility().getClassname(), DELIM, ENCLOSING); |
|
287 |
|
|
288 |
// latitude |
|
289 |
buff += getStringField(metadata.getLatitude().getValue(), DELIM, ENCLOSING); |
|
290 |
|
|
291 |
// longtitude |
|
292 |
buff += getStringField(metadata.getLongitude().getValue(), DELIM, ENCLOSING); |
|
293 |
|
|
294 |
// dateofvalidation, |
|
295 |
buff += getStringField(metadata.getDateofvalidation().getValue(), DELIM, ENCLOSING); |
|
296 |
; |
|
297 |
|
|
298 |
// yearofvalidation, |
|
299 |
buff += getYearInt(metadata.getDateofvalidation().getValue(), DELIM, ENCLOSING); |
|
300 |
; |
|
301 |
|
|
302 |
//website |
|
303 |
buff += getStringField(metadata.getWebsiteurl().getValue(), DELIM, ENCLOSING); |
|
304 |
; |
|
305 |
|
|
306 |
//harvested |
|
307 |
buff += getStringField("false", DELIM, ENCLOSING); |
|
308 |
; |
|
309 |
|
|
310 |
// deletedByInference |
|
311 |
buff += getStringField(String.valueOf(oaf.getDataInfo().getDeletedbyinference()), DELIM, ENCLOSING); |
|
312 |
; |
|
313 |
|
|
314 |
return buff; |
|
315 |
} |
|
316 |
|
|
317 |
private static String buildOrganization(Oaf oaf, String DELIM, String ENCLOSING) { |
|
318 |
OafEntity data = oaf.getEntity(); |
|
319 |
String buff = new String(); |
|
320 |
|
|
321 |
Organization organization = data.getOrganization(); |
|
322 |
Organization.Metadata metadata = organization.getMetadata(); |
|
323 |
|
|
324 |
// `name`, |
|
325 |
buff += getStringField(metadata.getLegalname().getValue(), DELIM, ENCLOSING); |
|
326 |
; |
|
327 |
// `country`, |
|
328 |
buff += getStringField(metadata.getCountry().getClassname(), DELIM, ENCLOSING); |
|
329 |
; |
|
330 |
|
|
331 |
//website |
|
332 |
buff += getStringField(metadata.getWebsiteurl().getValue(), DELIM, ENCLOSING); |
|
333 |
; |
|
334 |
|
|
335 |
|
|
336 |
// deletedByInference |
|
337 |
buff += getStringField(String.valueOf(oaf.getDataInfo().getDeletedbyinference()), DELIM, ENCLOSING); |
|
338 |
; |
|
339 |
|
|
340 |
|
|
341 |
return buff; |
|
342 |
|
|
343 |
} |
|
344 |
|
|
345 |
private static String buildResult(Oaf oaf, String DELIM, String ENCLOSING) { |
|
346 |
OafEntity data = oaf.getEntity(); |
|
347 |
String buff = new String(); |
|
348 |
|
|
349 |
Result result = data.getResult(); |
|
350 |
Result.Metadata metadata = result.getMetadata(); |
|
351 |
|
|
352 |
// pubtitle |
|
353 |
|
|
354 |
String titleString = new String(); |
|
355 |
|
|
356 |
|
|
357 |
for (int i = 0; i < metadata.getTitleList().size(); i++) { |
|
358 |
StructuredProperty title = metadata.getTitleList().get(i); |
|
359 |
if (i == 0) { |
|
360 |
titleString = title.getValue().replaceAll("\\s+", " "); |
|
361 |
titleString = titleString.replaceAll("\n", " "); |
|
362 |
} |
|
363 |
break; |
|
364 |
} |
|
365 |
// pubtitle |
|
366 |
buff += getStringField(titleString, DELIM, ENCLOSING); //!#Osvaldo Raineri: Gli Atti etiopici del martire egiziano Giorgio il nuovo (β 978)#!# |
|
367 |
|
|
368 |
// format |
|
369 |
String formatString = new String(); |
|
370 |
|
|
371 |
for (StringField format : metadata.getFormatList()) { |
|
372 |
formatString += format.getValue() + ";"; |
|
373 |
|
|
374 |
} |
|
375 |
|
|
376 |
buff += getStringField(formatString, DELIM, ENCLOSING); |
|
377 |
// publisher |
|
378 |
|
|
379 |
buff += getStringField(metadata.getPublisher().getValue(), DELIM, ENCLOSING); |
|
380 |
// journal |
|
381 |
|
|
382 |
buff += getStringField(metadata.getJournal().getName().replaceAll("\n", " "), DELIM, ENCLOSING); //#null#! |
|
383 |
|
|
384 |
// year |
|
385 |
buff += getYearInt(metadata.getDateofacceptance().getValue(), DELIM, ENCLOSING); |
|
386 |
|
|
387 |
// date CHANGED THIS TO DATE FORMAT |
|
388 |
buff += getStringDateField(metadata.getDateofacceptance().getValue(), DELIM, ENCLOSING); |
|
389 |
|
|
390 |
// access_mode, |
|
391 |
buff += getStringField(getAccessMode(result), DELIM, ENCLOSING); |
|
392 |
|
|
393 |
// bestlicense |
|
394 |
|
|
395 |
buff += getStringField(getBestLicense(result), DELIM, ENCLOSING); |
|
396 |
; |
|
397 |
// type |
|
398 |
buff += getStringField(metadata.getResulttype().getClassname(), DELIM, ENCLOSING); |
|
399 |
// embargo_end_date |
|
400 |
buff += getStringField(metadata.getEmbargoenddate().getValue(), DELIM, ENCLOSING); |
|
401 |
|
|
402 |
// `authors`, |
|
403 |
int authors = 0; |
|
404 |
String delayed = "no"; |
|
405 |
|
|
406 |
for (OafRel rel : data.getCachedRelList()) { |
|
407 |
|
|
408 |
if (rel.getRelType().equals(RelType.personResult)) { |
|
409 |
|
|
410 |
authors++; |
|
411 |
} else if (rel.getRelType().equals(RelType.resultProject)) |
|
412 |
// TODO remember : in result Project, first id is project, second is |
|
413 |
// result. |
|
414 |
{ |
|
415 |
|
|
416 |
String daysfromend = getYearDifferenceInteger(rel.getResultProject().getOutcome().getRelMetadata().getEnddate(), |
|
417 |
rel.getResultProject().getOutcome().getRelMetadata().getStartdate(), DELIM, ENCLOSING); |
|
418 |
if (Integer.parseInt(daysfromend) > 0) { |
|
419 |
delayed = "yes"; |
|
420 |
} |
|
421 |
} |
|
422 |
} |
|
423 |
// `delayed`, |
|
424 |
buff += getStringField(delayed, DELIM, ENCLOSING); |
|
425 |
|
|
426 |
buff += getNumericField(String.valueOf(authors), DELIM, ENCLOSING); |
|
427 |
|
|
428 |
|
|
429 |
// deletedByInference |
|
430 |
buff += getStringField(String.valueOf(oaf.getDataInfo().getDeletedbyinference()), DELIM, ENCLOSING); |
|
431 |
|
|
432 |
|
|
433 |
return buff; |
|
434 |
|
|
435 |
} |
|
436 |
|
|
437 |
|
|
438 |
private static String getBestLicense(Result result) { |
|
439 |
Qualifier bestLicense = null; |
|
440 |
LicenseComparator lc = new LicenseComparator(); |
|
441 |
for (Instance instance : (result.getInstanceList())) { |
|
442 |
if (lc.compare(bestLicense, instance.getLicence()) > 0) { |
|
443 |
bestLicense = instance.getLicence(); |
|
444 |
} |
|
445 |
} |
|
446 |
if (bestLicense != null) { |
|
447 |
return bestLicense.getClassname(); |
|
448 |
} else { |
|
449 |
return null; |
|
450 |
} |
|
451 |
} |
|
452 |
|
|
453 |
// TODO here iterate over all values |
|
454 |
private static String getAccessMode(Result result) { |
|
455 |
String accessMode = " "; |
|
456 |
for (Instance instance : (result.getInstanceList())) { |
|
457 |
if (instance.getLicence().getClassname() != null && !instance.getLicence().getClassname().isEmpty()) { |
|
458 |
accessMode = instance.getLicence().getClassname(); |
|
459 |
break; |
|
460 |
} |
|
461 |
|
|
462 |
} |
|
463 |
|
|
464 |
return accessMode; |
|
465 |
} |
|
466 |
|
|
467 |
private static String buildProject(Oaf oaf, String DELIM, String ENCLOSING) { |
|
468 |
OafEntity data = oaf.getEntity(); |
|
469 |
String buff = new String(); |
|
470 |
|
|
471 |
Project project = data.getProject(); |
|
472 |
Project.Metadata metadata = project.getMetadata(); |
|
473 |
|
|
474 |
|
|
475 |
// `acronym`, |
|
476 |
String acronym = metadata.getAcronym().getValue(); |
|
477 |
if (acronym.equalsIgnoreCase("UNKNOWN")) { |
|
478 |
acronym = metadata.getTitle().getValue(); |
|
479 |
} |
|
480 |
|
|
481 |
buff += getStringField(acronym, DELIM, ENCLOSING); |
|
482 |
|
|
483 |
//title! |
|
484 |
String title = getStringField(metadata.getTitle().getValue(), DELIM, ENCLOSING); |
|
485 |
buff += getStringField(title, DELIM, ENCLOSING); |
|
486 |
|
|
487 |
List<StringField> fundList = metadata.getFundingtreeList(); |
|
488 |
|
|
489 |
|
|
490 |
if (!fundList.isEmpty()) // `funding_lvl0`, |
|
491 |
{ |
|
492 |
//TODO funder + 3 funding levels |
|
493 |
/* funder text, |
|
494 |
funding_lvl0 text, |
|
495 |
funding_lvl1 text, |
|
496 |
funding_lvl2 text, |
|
497 |
funding_lvl3 text,*/ |
|
498 |
buff += FundingParser.getFundingInfo(fundList.get(0).getValue(), DELIM, ENCLOSING); |
|
499 |
; |
|
500 |
|
|
501 |
} else { |
|
502 |
|
|
503 |
buff += FundingParser.getFundingInfo("", DELIM, ENCLOSING); |
|
504 |
; |
|
505 |
} |
|
506 |
|
|
507 |
|
|
508 |
String sc39 = metadata.getEcsc39().getValue().toString(); |
|
509 |
if (sc39.equalsIgnoreCase("true") || sc39.equalsIgnoreCase("t") || sc39.contains("yes")) { |
|
510 |
sc39 = "yes"; |
|
511 |
} else if (sc39.equalsIgnoreCase("false") || sc39.equalsIgnoreCase("f") || sc39.contains("no")) { |
|
512 |
sc39 = "no"; |
|
513 |
} |
|
514 |
|
|
515 |
buff += getStringField(sc39, DELIM, ENCLOSING); |
|
516 |
|
|
517 |
// `url`, |
|
518 |
buff += getStringField(metadata.getWebsiteurl().getValue(), DELIM, ENCLOSING); |
|
519 |
|
|
520 |
// start_year |
|
521 |
|
|
522 |
buff += getYearInt(metadata.getStartdate().getValue(), DELIM, ENCLOSING); |
|
523 |
|
|
524 |
// end_year |
|
525 |
buff += getYearInt(metadata.getEnddate().getValue(), DELIM, ENCLOSING); |
|
526 |
|
|
527 |
// duration enddate-startdate |
|
528 |
|
|
529 |
buff += getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue(), DELIM, ENCLOSING); |
|
530 |
|
|
531 |
// haspubs |
|
532 |
buff += getStringField("no", DELIM, ENCLOSING); |
|
533 |
|
|
534 |
// numpubs |
|
535 |
buff += getNumericField("0", DELIM, ENCLOSING); |
|
536 |
// enddate |
|
537 |
buff += getNumericField(metadata.getEnddate().getValue(), DELIM, ENCLOSING); |
|
538 |
// startdate |
|
539 |
buff += getNumericField(metadata.getStartdate().getValue(), DELIM, ENCLOSING); |
|
540 |
|
|
541 |
// `daysforlastpub`, |
|
542 |
buff += getNumericField("", DELIM, ENCLOSING); |
|
543 |
// `delayedpubs`, |
|
544 |
buff += getNumericField("", DELIM, ENCLOSING); |
|
545 |
|
|
546 |
//call identifier |
|
547 |
buff += getStringField(metadata.getCallidentifier().getValue(), DELIM, ENCLOSING); |
|
548 |
//code |
|
549 |
buff += getStringField(metadata.getCode().getValue(), DELIM, ENCLOSING); |
|
550 |
//esc39 |
|
551 |
buff += getStringField(metadata.getEcsc39().getValue(), DELIM, ENCLOSING); |
|
552 |
|
|
553 |
|
|
554 |
// deletedByInference |
|
555 |
buff += getStringField(String.valueOf(oaf.getDataInfo().getDeletedbyinference()), DELIM, ENCLOSING); |
|
556 |
return buff; |
|
557 |
|
|
558 |
} |
|
559 |
|
|
560 |
private static String buildPerson(Oaf oaf, String DELIM, String ENCLOSING) { |
|
561 |
OafEntity data = oaf.getEntity(); |
|
562 |
String buff = new String(); |
|
563 |
|
|
564 |
PersonProtos.Person person = data.getPerson(); |
|
565 |
PersonProtos.Person.Metadata metadata = person.getMetadata(); |
|
566 |
|
|
567 |
//firstName |
|
568 |
buff += getStringField(metadata.getFirstname().getValue(), DELIM, ENCLOSING); |
|
569 |
//LastName |
|
570 |
|
|
571 |
String secondName = new String(); |
|
572 |
for (StringField s : metadata.getSecondnamesList()) { |
|
573 |
secondName += s.getValue() + " "; |
|
574 |
} |
|
575 |
|
|
576 |
buff += getStringField(secondName, DELIM, ENCLOSING); |
|
577 |
|
|
578 |
// `fullname`, |
|
579 |
buff += getStringField(metadata.getFullname().getValue(), DELIM, ENCLOSING); |
|
580 |
|
|
581 |
// `Nationality`, |
|
582 |
buff += getStringField(metadata.getNationality().getClassname(), DELIM, ENCLOSING); |
|
583 |
// `Email`, |
|
584 |
buff += getStringField(metadata.getEmail().getValue(), DELIM, ENCLOSING); |
|
585 |
// `Phone`, |
|
586 |
buff += getStringField(metadata.getPhone().getValue(), DELIM, ENCLOSING); |
|
587 |
// deletedByInference |
|
588 |
buff += getStringField(String.valueOf(oaf.getDataInfo().getDeletedbyinference()), DELIM, ENCLOSING); |
|
589 |
|
|
590 |
return buff; |
|
591 |
|
|
592 |
} |
|
593 |
|
|
594 |
|
|
595 |
private static String getYearDifferenceInteger(String enddate, String startdate, String DELIM, String ENCLOSING) { |
|
596 |
|
|
597 |
if (enddate != null && !enddate.isEmpty() && startdate != null && !startdate.isEmpty()) { |
|
598 |
|
|
599 |
String[] split = startdate.split("-"); |
|
600 |
|
|
601 |
if (split == null || split.length == 0) { |
|
602 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
603 |
} |
|
604 |
|
|
605 |
int Startdate = Integer.parseInt(split[0]); |
|
606 |
|
|
607 |
split = enddate.split("-"); |
|
608 |
|
|
609 |
if (split == null || split.length == 0) { |
|
610 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
611 |
} |
|
612 |
|
|
613 |
int Enddate = Integer.parseInt(split[0]); |
|
614 |
|
|
615 |
int diff = Enddate - Startdate; |
|
616 |
|
|
617 |
return ENCLOSING + diff + ENCLOSING + DELIM; |
|
618 |
|
|
619 |
} |
|
620 |
|
|
621 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
622 |
} |
|
623 |
|
|
624 |
private static String getYearInt(String data, String DELIM, String ENCLOSING) { |
|
625 |
if (data == null || data.isEmpty() || data.equals("-1")) { |
|
626 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
627 |
} |
|
628 |
|
|
629 |
String[] split = data.split("-"); |
|
630 |
|
|
631 |
if (split == null || split.length == 0) { |
|
632 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
633 |
} |
|
634 |
|
|
635 |
String year = split[0]; |
|
636 |
|
|
637 |
year = cleanNumber(year); |
|
638 |
|
|
639 |
return ENCLOSING + year + ENCLOSING + DELIM; |
|
640 |
|
|
641 |
|
|
642 |
} |
|
643 |
|
|
644 |
private static String cleanNumber(String number) { |
|
645 |
number = number.replaceAll("[^A-Za-z0-9:,____]", ""); |
|
646 |
|
|
647 |
return number; |
|
648 |
} |
|
649 |
|
|
650 |
private static String getStringField(String data, String DELIM, String ENCLOSING) { |
|
651 |
|
|
652 |
if (data == null || data.isEmpty()) { |
|
653 |
|
|
654 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
655 |
} else { |
|
656 |
String field = clean(data, DELIM, ENCLOSING); |
|
657 |
|
|
658 |
if (field == null) { |
|
659 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
660 |
} else { |
|
661 |
|
|
662 |
return ENCLOSING + field + ENCLOSING + DELIM; |
|
663 |
} |
|
664 |
} |
|
665 |
} |
|
666 |
|
|
667 |
private static String getStringDateField(String data, String DELIM, String ENCLOSING) { |
|
668 |
|
|
669 |
if (data == null || data.isEmpty() || data.equals("") || data.equals("-1")) { |
|
670 |
|
|
671 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
672 |
} else { |
|
673 |
|
|
674 |
String field = clean(data, DELIM, ENCLOSING); |
|
675 |
if (field == null) { |
|
676 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
677 |
} else { |
|
678 |
return ENCLOSING + field + ENCLOSING + DELIM; |
|
679 |
} |
|
680 |
} |
|
681 |
} |
|
682 |
|
|
683 |
private static String getNumericField(String data, String DELIM, String ENCLOSING) { |
|
684 |
if (data == null || data.isEmpty() || data.equals("")) { |
|
685 |
return ENCLOSING + "0" + ENCLOSING + DELIM; |
|
686 |
} else { |
|
687 |
return ENCLOSING + data + ENCLOSING + DELIM; |
|
688 |
} |
|
689 |
} |
|
690 |
|
|
691 |
public static String getId(Oaf oaf, String DELIM, String ENCLOSING) { |
|
692 |
switch (oaf.getKind()) { |
|
693 |
case entity: |
|
694 |
|
|
695 |
return cleanId(oaf.getEntity().getId(), DELIM, ENCLOSING); |
|
696 |
case relation: |
|
697 |
|
|
698 |
return cleanId(oaf.getRel().getSource(), DELIM, ENCLOSING); |
|
699 |
|
|
700 |
} |
|
701 |
return null; |
|
702 |
|
|
703 |
} |
|
704 |
|
|
705 |
public static String getId(OafRel relOaf, String DELIM, String ENCLOSING) { |
|
706 |
return cleanId(relOaf.getSource(), DELIM, ENCLOSING); |
|
707 |
} |
|
708 |
|
|
709 |
public static String clean(String value, String DELIM, String ENCLOSING) { |
|
710 |
if (value != null) { |
|
711 |
// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____:: |
|
712 |
// to datacite____:: ) |
|
713 |
// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA |
|
714 |
value = value.replaceFirst(".*\\|", ""); |
|
715 |
|
|
716 |
value = value.replace(DELIM, " "); |
|
717 |
value = value.replaceAll("'", " "); |
|
718 |
value = value.replace("#", " "); |
|
719 |
|
|
720 |
value = value.replaceAll("\\r\\n|\\r|\\n", " "); |
|
721 |
value = value.replaceAll("\\s+", " "); |
|
722 |
value = value.replaceAll("(\\r|\\n)", " "); |
|
723 |
value = value.replaceAll("\\t", " "); |
|
724 |
value = value.replace("\n", " "); |
|
725 |
|
|
726 |
value = value.replaceAll("[\"\\r\\\\;]", ""); |
|
727 |
|
|
728 |
value = value.replace("\"", " "); |
|
729 |
|
|
730 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/!@+=]+", " "); |
|
731 |
|
|
732 |
} |
|
733 |
if (value == null) { |
|
734 |
return null; |
|
735 |
} |
|
736 |
return value; |
|
737 |
|
|
738 |
} |
|
739 |
|
|
740 |
public static String cleanId(String value, String DELIM, String ENCLOSING) { |
|
741 |
if (value != null) { |
|
742 |
// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____:: |
|
743 |
|
|
744 |
// to datacite____:: ) |
|
745 |
// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA |
|
746 |
value = value.replaceFirst(".*\\|", ""); |
|
747 |
value = value.replace("\n", ""); |
|
748 |
value = value.replace(DELIM, ""); |
|
749 |
value = value.replace("'", " "); |
|
750 |
value = value.replace(ENCLOSING, " "); |
|
751 |
|
|
752 |
|
|
753 |
value = value.trim(); |
|
754 |
|
|
755 |
} |
|
756 |
if (value == null) { |
|
757 |
return null; |
|
758 |
} |
|
759 |
return ENCLOSING + value + ENCLOSING; |
|
760 |
|
|
761 |
} |
|
762 |
|
|
763 |
public static long DATEDIFF(String startDate, String endDate) { |
|
764 |
long MILLISECS_PER_DAY = 24 * 60 * 60 * 1000; |
|
765 |
long days = 0l; |
|
766 |
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); // "dd/MM/yyyy HH:mm:ss"); |
|
767 |
// <startdate>2011-09-01</startdate> |
|
768 |
// <enddate>2015-08-31</enddate> |
|
769 |
Date dateIni = null; |
|
770 |
Date dateFin = null; |
|
771 |
|
|
772 |
if (startDate == null || startDate.isEmpty() || endDate == null || endDate.isEmpty()) { |
|
773 |
return 0; |
|
774 |
} |
|
775 |
try { |
|
776 |
dateIni = (Date) format.parse(startDate); |
|
777 |
dateFin = (Date) format.parse(endDate); |
|
778 |
days = (dateFin.getTime() - dateIni.getTime()) / MILLISECS_PER_DAY; |
|
779 |
} catch (Exception e) { |
|
780 |
|
|
781 |
return 0; |
|
782 |
} |
|
783 |
|
|
784 |
return days; |
|
785 |
} |
|
786 |
|
|
787 |
|
|
788 |
} |
|
0 | 789 |
modules/dnet-openaire-stats/branches/full_export/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/FundingParser.java | ||
---|---|---|
7 | 7 |
*/ |
8 | 8 |
|
9 | 9 |
public class FundingParser { |
10 |
private String NULL_STRING; |
|
11 |
private String ENCLOSED; |
|
12 |
private String DELIM ; |
|
13 | 10 |
|
14 | 11 |
|
15 |
public FundingParser(String delim, String enclosed, String nullString) |
|
16 |
{ |
|
17 |
this.setNULL_STRING(nullString); |
|
18 |
this.setENCLOSED(enclosed); |
|
19 |
this.setDELIM(delim); |
|
12 |
public static String getFundingLevel(String funding_level, int level, String DELIM, String ENCLOSING) { |
|
20 | 13 |
|
21 |
} |
|
14 |
if (funding_level.isEmpty()) { |
|
15 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
16 |
} |
|
22 | 17 |
|
23 |
private Logger log = Logger.getLogger(this.getClass()); |
|
18 |
if (!funding_level.contains("<funding_level_" + level + ">")) { |
|
19 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
20 |
} |
|
24 | 21 |
|
25 |
String getFundingLevel(String funding_level, int level) {
|
|
22 |
String[] split = funding_level.split("<funding_level_" + level + ">");
|
|
26 | 23 |
|
27 |
if (funding_level.isEmpty()) { |
|
28 |
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM; |
|
29 |
} |
|
24 |
funding_level = split[1]; |
|
30 | 25 |
|
31 |
if (!funding_level.contains("<funding_level_" + level + ">")) { |
|
32 |
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM; |
|
33 |
} |
|
26 |
split = funding_level.split("<name>"); |
|
27 |
funding_level = split[1]; |
|
34 | 28 |
|
35 |
String[] split = funding_level.split("<funding_level_" + level + ">"); |
|
36 |
|
|
37 |
funding_level = split[1]; |
|
38 |
|
|
39 |
split = funding_level.split("<name>"); |
|
40 |
funding_level = split[1]; |
|
41 |
|
|
42 |
funding_level = funding_level.substring(0, funding_level.indexOf("</name>")); |
|
43 |
funding_level = funding_level.replaceAll("\"", ""); |
|
29 |
funding_level = funding_level.substring(0, funding_level.indexOf("</name>")); |
|
30 |
funding_level = funding_level.replaceAll("\"", ""); |
|
44 | 31 |
funding_level = funding_level.replaceAll("/>", ""); |
45 | 32 |
funding_level = funding_level.replaceAll("<", ""); |
46 |
funding_level = funding_level.replaceAll("&", "");
|
|
33 |
funding_level = funding_level.replaceAll("&", "");
|
|
47 | 34 |
|
48 |
if (level == 1) {
|
|
49 |
if (funding_level.equalsIgnoreCase("SP1")) {
|
|
50 |
funding_level = "SP1-Cooperation";
|
|
51 |
} else if (funding_level.equalsIgnoreCase("SP2")) {
|
|
52 |
funding_level = "SP2-Ideas";
|
|
53 |
}
|
|
54 |
if (funding_level.equalsIgnoreCase("SP3")) {
|
|
55 |
funding_level = "SP3-People";
|
|
56 |
} else if (funding_level.equalsIgnoreCase("SP4")) {
|
|
57 |
funding_level = "SP4-Capacities";
|
|
35 |
if (level == 1) {
|
|
36 |
if (funding_level.equalsIgnoreCase("SP1")) {
|
|
37 |
funding_level = "SP1-Cooperation";
|
|
38 |
} else if (funding_level.equalsIgnoreCase("SP2")) {
|
|
39 |
funding_level = "SP2-Ideas";
|
|
40 |
}
|
|
41 |
if (funding_level.equalsIgnoreCase("SP3")) {
|
|
42 |
funding_level = "SP3-People";
|
|
43 |
} else if (funding_level.equalsIgnoreCase("SP4")) {
|
|
44 |
funding_level = "SP4-Capacities";
|
|
58 | 45 |
|
59 |
} else if (funding_level.equalsIgnoreCase("SP5")) {
|
|
60 |
funding_level = "SP5-Euratom";
|
|
61 |
}
|
|
62 |
}
|
|
46 |
} else if (funding_level.equalsIgnoreCase("SP5")) {
|
|
47 |
funding_level = "SP5-Euratom";
|
|
48 |
}
|
|
49 |
}
|
|
63 | 50 |
|
64 | 51 |
|
65 |
funding_level = funding_level.replaceAll(">", "");
|
|
52 |
funding_level = funding_level.replaceAll(">", "");
|
|
66 | 53 |
|
67 |
funding_level = funding_level.replaceAll("</", "");
|
|
68 |
funding_level=funding_level.replace(DELIM," ");
|
|
69 |
funding_level=funding_level.replace(ENCLOSED,"");
|
|
54 |
funding_level = funding_level.replaceAll("</", "");
|
|
55 |
funding_level = funding_level.replace(DELIM, " ");
|
|
56 |
funding_level = funding_level.replace(ENCLOSING, " ");
|
|
70 | 57 |
|
58 |
return ENCLOSING + funding_level + ENCLOSING + DELIM; |
|
59 |
} |
|
71 | 60 |
|
72 |
return ENCLOSED + funding_level + ENCLOSED + DELIM; |
|
73 |
} |
|
61 |
public static String getFundingInfo(String buff, String DELIM, String ENCLOSING) { |
|
62 |
return getFunder(buff, DELIM, ENCLOSING) + |
|
63 |
getFundingLevel(buff, 0, DELIM, ENCLOSING) + (getFundingLevel(buff, 1, DELIM, ENCLOSING) + getFundingLevel(buff, 2, DELIM, ENCLOSING) |
|
64 |
+ getFundingLevel(buff, 3, DELIM, ENCLOSING)); |
|
65 |
} |
|
74 | 66 |
|
75 |
public String getFundingInfo(String buff) { |
|
76 |
return getFunder(buff) + getFundingLevel(buff, 0) + (getFundingLevel(buff, 1) + getFundingLevel(buff, 2) + getFundingLevel(buff, 3)); |
|
77 |
} |
|
67 |
public static String getFunder(String buff, String DELIM, String ENCLOSING) { |
|
78 | 68 |
|
79 |
public String getFunder(String buff) { |
|
69 |
if (buff.isEmpty()) { |
|
70 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
80 | 71 |
|
72 |
} |
|
73 |
if (!buff.contains("<funder>")) { |
|
74 |
return ENCLOSING + " " + ENCLOSING + DELIM; |
|
75 |
} |
|
81 | 76 |
|
82 |
if (buff.isEmpty()) { |
|
83 |
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM; |
|
77 |
String[] split = buff.split("<funder>"); |
|
78 |
String funder = split[1]; |
|
79 |
split = funder.split("<name>"); |
|
84 | 80 |
|
85 |
}
|
|
81 |
funder = split[1];
|
|
86 | 82 |
|
87 |
if (!buff.contains("<funder>")) { |
|
88 |
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM; |
|
89 |
} |
|
90 |
String[] split = buff.split("<funder>"); |
|
91 |
String funder = split[1]; |
|
83 |
funder = funder.substring(0, funder.indexOf("</name>")); |
|
92 | 84 |
|
93 |
split = funder.split("<name>");
|
|
85 |
funder = funder.replaceAll(">", "");
|
|
94 | 86 |
|
95 |
funder = split[1];
|
|
87 |
funder = funder.replaceAll("</", "");
|
|
96 | 88 |
|
97 |
funder = funder.substring(0, funder.indexOf("</name>")); |
|
89 |
funder = funder.replaceAll("\"", ""); |
|
90 |
funder = funder.replaceAll("&", ""); |
|
91 |
funder = funder.replace(ENCLOSING, " "); |
|
98 | 92 |
|
99 |
funder = funder.replaceAll(">", ""); |
|
93 |
return ENCLOSING + funder + ENCLOSING + DELIM; |
|
94 |
} |
|
100 | 95 |
|
101 |
funder = funder.replaceAll("</", ""); |
|
102 |
|
|
103 |
funder = funder.replaceAll("\"", ""); |
|
104 |
funder = funder.replaceAll("&", ""); |
|
105 |
|
|
106 |
|
|
107 |
return ENCLOSED + funder + ENCLOSED + DELIM; |
|
108 |
} |
|
109 |
|
|
110 |
public String getNULL_STRING() { |
|
111 |
return NULL_STRING; |
|
112 |
} |
|
113 |
|
|
114 |
public void setNULL_STRING(String NULL_STRING) { |
|
115 |
this.NULL_STRING = NULL_STRING; |
|
116 |
} |
|
117 |
|
|
118 |
public String getENCLOSED() { |
|
119 |
return ENCLOSED; |
|
120 |
} |
|
121 |
|
|
122 |
public void setENCLOSED(String ENCLOSED) { |
|
123 |
this.ENCLOSED = ENCLOSED; |
|
124 |
} |
|
125 |
|
|
126 |
public String getDELIM() { |
|
127 |
return DELIM; |
|
128 |
} |
|
129 |
|
|
130 |
public void setDELIM(String DELIM) { |
|
131 |
this.DELIM = DELIM; |
|
132 |
} |
|
133 | 96 |
} |
134 | 97 |
|
modules/dnet-openaire-stats/branches/full_export/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/Serializer.java | ||
---|---|---|
1 |
/* |
|
1 | 2 |
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils; |
2 | 3 |
|
4 |
import com.google.common.collect.ArrayListMultimap; |
|
5 |
import com.google.common.collect.Multimap; |
|
3 | 6 |
import eu.dnetlib.data.mapreduce.util.LicenseComparator; |
4 | 7 |
import eu.dnetlib.data.proto.DatasourceProtos.Datasource; |
5 | 8 |
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata; |
... | ... | |
26 | 29 |
import java.util.HashMap; |
27 | 30 |
import java.util.List; |
28 | 31 |
|
32 |
*/ |
|
29 | 33 |
/** |
30 | 34 |
* @author eri Simple serializer that parses input Oaf Protos and prepares them |
31 | 35 |
* for sqoop |
32 |
*/ |
|
36 |
*//* |
|
37 |
|
|
33 | 38 |
public class Serializer { |
34 | 39 |
|
35 | 40 |
private static String DELIM; |
41 |
//TODO replae here later |
|
42 |
private String ENCLOSING = "'"; |
|
36 | 43 |
private Logger log = Logger.getLogger(this.getClass()); |
37 |
private String NULL_STRING; |
|
38 |
private String NULL_NUM; |
|
44 |
private String NULL_STRING = "";
|
|
45 |
private String NULL_NUM = "0";
|
|
39 | 46 |
//TODO no longer used |
47 |
*/ |
|
48 |
/* |
|
40 | 49 |
private static String ENCLOSED; |
50 |
*//* |
|
41 | 51 |
|
52 |
|
|
42 | 53 |
private FundingParser fundingParser; |
43 | 54 |
|
44 |
|
|
45 | 55 |
public Serializer(String delim, String nullNum, String nullString, String enclosed) { |
46 |
|
|
47 | 56 |
this.setDELIM(delim); |
48 |
this.setNULL_NUM(nullNum);
|
|
49 |
this.setENCLOSED(enclosed);
|
|
50 |
this.setNULL_STRING(nullString); |
|
57 |
*/
|
|
58 |
/*this.setNULL_NUM(nullNum);
|
|
59 |
this.setNULL_STRING(nullString);*//*
|
|
51 | 60 |
|
52 |
fundingParser=new FundingParser(delim, enclosed,nullString);
|
|
61 |
fundingParser = new FundingParser(delim, enclosed, nullString);
|
|
53 | 62 |
|
54 | 63 |
} |
55 | 64 |
|
... | ... | |
63 | 72 |
switch (valueEntity.getType()) { |
64 | 73 |
case datasource: |
65 | 74 |
|
66 |
return buildDatasource(valueEntity);
|
|
75 |
return buildDatasource(oaf);
|
|
67 | 76 |
|
68 | 77 |
case organization: |
69 | 78 |
|
70 |
return buildOrganization(valueEntity);
|
|
79 |
return buildOrganization(oaf);
|
|
71 | 80 |
|
72 | 81 |
case project: |
73 | 82 |
|
74 |
return buildProject(valueEntity);
|
|
75 |
case result: |
|
83 |
return buildProject(oaf);
|
|
84 |
case result:`
|
|
76 | 85 |
|
77 |
return buildResult(valueEntity);
|
|
86 |
return buildResult(oaf);
|
|
78 | 87 |
case person: |
79 |
return buildPerson(valueEntity);
|
|
88 |
return buildPerson(oaf);
|
|
80 | 89 |
default: |
81 | 90 |
log.error("wrong type"); |
82 | 91 |
break; |
... | ... | |
109 | 118 |
} |
110 | 119 |
|
111 | 120 |
|
112 |
public HashMap<String, List<String>> extractRelations(Oaf oaf) {
|
|
121 |
public Multimap<String, String> extractRelations(Oaf oaf) {
|
|
113 | 122 |
OafEntity valueEntity = oaf.getEntity(); |
114 |
HashMap<String, List<String>> relations = new HashMap<String, List<String>>(); |
|
115 |
relations.putAll(getOriginalId(valueEntity)); |
|
123 |
Multimap<String, String> relations = ArrayListMultimap.create(); |
|
116 | 124 |
|
125 |
getOriginalId(valueEntity, relations); |
|
126 |
|
|
117 | 127 |
switch (valueEntity.getType()) { |
118 | 128 |
case datasource: |
119 |
relations.putAll(getDatasourceLanguages(valueEntity));
|
|
129 |
getDatasourceLanguages(valueEntity, relations);
|
|
120 | 130 |
case result: |
121 |
relations.putAll(getResultTopics(valueEntity));
|
|
122 |
relations.putAll(getResultLanguages(valueEntity));
|
|
123 |
relations.putAll(getResultClassifications(valueEntity));
|
|
124 |
relations.putAll(getResultDatasources(valueEntity));
|
|
125 |
relations.putAll(getResultConcepts(valueEntity));
|
|
126 |
relations.putAll(getResultDois(valueEntity));
|
|
127 |
relations.putAll(getResultCitations(valueEntity));
|
|
128 |
relations.putAll(getResultDescriptions(valueEntity));
|
|
131 |
getResultTopics(valueEntity, relations);
|
|
132 |
getResultLanguages(valueEntity, relations);
|
|
133 |
getResultClassifications(valueEntity, relations);
|
|
134 |
getResultDatasources(valueEntity, relations);
|
|
135 |
getResultConcepts(valueEntity, relations);
|
|
136 |
getResultDois(valueEntity, relations);
|
|
137 |
getResultCitations(valueEntity, relations);
|
|
138 |
getResultDescriptions(valueEntity, relations);
|
|
129 | 139 |
return relations; |
130 | 140 |
|
131 | 141 |
case project: |
132 |
relations.putAll(getProjectKeywords(valueEntity));
|
|
133 |
relations.putAll(getProjectSubjects(valueEntity));
|
|
142 |
getProjectKeywords(valueEntity, relations);
|
|
143 |
getProjectSubjects(valueEntity, relations);
|
|
134 | 144 |
|
135 | 145 |
default: |
136 | 146 |
return null; |
... | ... | |
139 | 149 |
} |
140 | 150 |
|
141 | 151 |
|
152 |
private void getOriginalId(OafEntity oafEntity, Multimap<String, String> relations) { |
|
142 | 153 |
|
143 |
private HashMap<String, List<String>> getOriginalId(OafEntity oafEntity) { |
|
144 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
145 |
List<String> buffs = new ArrayList<String>(); |
|
146 |
|
|
147 |
|
|
154 |
String relName = oafEntity.getType().toString().toLowerCase() + "Oid"; |
|
148 | 155 |
for (String oid : oafEntity.getOriginalIdList()) { |
149 |
buffs.add(oid.replace(DELIM, " ").replace("\n", "") + DELIM);
|
|
156 |
relations.put(relName, ENCLOSING + oid.replace(DELIM, " ").replace("\n", "") + ENCLOSING + DELIM);
|
|
150 | 157 |
} |
151 | 158 |
|
152 |
rels.put(oafEntity.getType().toString().toLowerCase() + "Oid", buffs); |
|
153 |
|
|
154 |
return rels; |
|
155 | 159 |
} |
156 | 160 |
|
157 |
private HashMap<String, List<String>> getProjectKeywords(OafEntity oafEntity) { |
|
158 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
159 |
List<String> buffs = new ArrayList<String>(); |
|
161 |
private void getProjectKeywords(OafEntity oafEntity, Multimap<String, String> relations) { |
|
162 |
relations.put("projectKeyword", getStringField(oafEntity.getProject().getMetadata().getKeywords().getValue())); |
|
160 | 163 |
|
161 |
// String[] keywords = oafEntity.getProject().getMetadata().getKeywords().getValue().replace(";", ",").split(","); |
|
162 |
|
|
163 |
/* for (String keyword : keywords) { |
|
164 |
buffs.add(getStringField(keyword)); |
|
165 |
}*/ |
|
166 |
|
|
167 |
|
|
168 |
buffs.add(getStringField(oafEntity.getProject().getMetadata().getKeywords().getValue())); |
|
169 |
rels.put("projectKeyword", buffs); |
|
170 |
return rels; |
|
171 | 164 |
} |
172 | 165 |
|
173 |
private HashMap<String, List<String>> getProjectSubjects(OafEntity oafEntity) { |
|
174 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
175 |
List<String> buffs = new ArrayList<String>(); |
|
176 |
|
|
177 |
|
|
166 |
private void getProjectSubjects(OafEntity oafEntity, Multimap<String, String> relations) { |
|
178 | 167 |
for (StructuredProperty subj : oafEntity.getProject().getMetadata().getSubjectsList()) { |
179 |
buffs.add(getStringField(subj.getValue()));
|
|
168 |
relations.put("projectSubject", getStringField(subj.getValue()));
|
|
180 | 169 |
} |
181 |
|
|
182 |
rels.put("projectSubject", buffs); |
|
183 |
|
|
184 |
return rels; |
|
185 | 170 |
} |
186 | 171 |
|
187 |
|
|
188 | 172 |
private String getResultProject(OafRel oaf) { |
189 | 173 |
String buff = new String(); |
190 | 174 |
String result = oaf.getTarget(); |
191 | 175 |
|
192 |
|
|
193 | 176 |
buff += getStringField(result); |
194 | 177 |
// TODO is declared as int!!! |
195 | 178 |
long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate()); |
... | ... | |
201 | 184 |
} |
202 | 185 |
|
203 | 186 |
|
204 |
private HashMap<String, List<String>> getDatasourceLanguages(OafEntity valueEntity) { |
|
205 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
206 |
List<String> buffs = new ArrayList<String>(); |
|
207 |
|
|
208 |
|
|
187 |
private void getDatasourceLanguages(OafEntity valueEntity, Multimap<String, String> rels) { |
|
209 | 188 |
Datasource d = valueEntity.getDatasource(); |
210 |
|
|
211 | 189 |
Metadata metadata = d.getMetadata(); |
212 | 190 |
|
213 | 191 |
for (StringField lang : metadata.getOdlanguagesList()) { |
214 |
|
|
215 |
buffs.add(getStringField(lang.getValue())); |
|
192 |
rels.put("datasourceLanguage", getStringField(lang.getValue())); |
|
216 | 193 |
} |
217 |
rels.put("datasourceLanguage", buffs); |
|
218 |
return rels; |
|
219 | 194 |
} |
220 | 195 |
|
221 |
private HashMap<String, List<String>> getResultLanguages(OafEntity valueEntity) { |
|
222 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
223 |
List<String> buffs = new ArrayList<String>(); |
|
196 |
private void getResultLanguages(OafEntity valueEntity, Multimap<String, String> rels) { |
|
197 |
|
|
224 | 198 |
Result d = valueEntity.getResult(); |
225 |
|
|
226 | 199 |
eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata(); |
227 | 200 |
|
228 | 201 |
if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) { |
229 | 202 |
|
230 |
buffs.add(getStringField(metadata.getLanguage().getClassname()));
|
|
203 |
rels.put("resultLanguage", getStringField(metadata.getLanguage().getClassname()));
|
|
231 | 204 |
} |
232 |
rels.put("resultLanguage", buffs); |
|
233 |
return rels; |
|
234 | 205 |
|
235 | 206 |
} |
236 | 207 |
|
208 |
private void getResultDois(OafEntity valueEntity, Multimap<String, String> rels) { |
|
237 | 209 |
|
238 |
private HashMap<String, List<String>> getResultDois(OafEntity valueEntity) { |
|
239 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
240 |
List<String> buffs = new ArrayList<String>(); |
|
241 |
|
|
242 | 210 |
for (StructuredProperty pid : valueEntity.getPidList()) { |
243 |
buffs.add(getStringField(pid.getQualifier().getClassname()) + getStringField(pid.getValue())); |
|
244 | 211 |
|
212 |
rels.put("resultPid", |
|
213 |
getStringField(pid.getQualifier().getClassname()) + getStringField(pid.getValue())); |
|
245 | 214 |
} |
246 |
|
|
247 |
rels.put("resultPid", buffs); |
|
248 |
return rels; |
|
249 |
|
|
250 | 215 |
} |
251 | 216 |
|
252 |
private HashMap<String, List<String>> getResultClassifications(OafEntity valueEntity) {
|
|
217 |
private void getResultClassifications(OafEntity valueEntity, Multimap<String, String> rels) {
|
|
253 | 218 |
|
254 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
255 |
List<String> buffs = new ArrayList<String>(); |
|
256 | 219 |
Result result = valueEntity.getResult(); |
257 | 220 |
|
258 | 221 |
for (Instance instance : (result.getInstanceList())) { |
259 | 222 |
String classification = instance.getInstancetype().getClassname(); |
260 | 223 |
if (classification != null && !classification.isEmpty()) { |
261 |
buffs.add(getStringField(instance.getInstancetype().getClassname()));
|
|
224 |
rels.put("resultClassification", getStringField(instance.getInstancetype().getClassname()));
|
|
262 | 225 |
// TODO HERE KEEP ONLY ONE CLASSIFICATIONS PER RESULT |
263 | 226 |
break; |
264 | 227 |
} |
265 | 228 |
} |
266 |
rels.put("resultClassification", buffs); |
|
267 |
return rels; |
|
268 |
|
|
269 | 229 |
} |
270 | 230 |
|
271 |
private HashMap<String, List<String>> getResultDescriptions(OafEntity valueEntity) { |
|
231 |
private void getResultDescriptions(OafEntity valueEntity, Multimap<String, String> rels) { |
|
232 |
Result result = valueEntity.getResult(); |
|
233 |
//description |
|
272 | 234 |
|
273 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
274 |
List<String> buffs = new ArrayList<String>(); |
|
275 |
Result result = valueEntity.getResult(); |
|
235 |
for (StringField s : result.getMetadata().getDescriptionList()) { |
|
276 | 236 |
|
237 |
rels.put("resultDescription", getStringField(Jsoup.parse(s.getValue()).text())); |
|
238 |
} |
|
239 |
} |
|
277 | 240 |
|
278 |
//description |
|
279 |
for (StringField s : result.getMetadata().getDescriptionList()) { |
|
280 |
buffs.add(getStringField( Jsoup.parse(s.getValue()).text())); |
|
281 |
} |
|
241 |
private void getResultConcepts(OafEntity valueEntity, Multimap<String, String> rels) { |
|
282 | 242 |
|
283 |
|
|
284 |
rels.put("resultDescription", buffs); |
|
285 |
return rels; |
|
286 |
|
|
287 |
} |
|
288 |
|
|
289 |
|
|
290 |
|
|
291 |
|
|
292 |
private HashMap<String, List<String>> getResultConcepts(OafEntity valueEntity) { |
|
293 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
294 |
List<String> buffs = new ArrayList<String>(); |
|
295 |
|
|
296 | 243 |
Result result = valueEntity.getResult(); |
297 | 244 |
|
298 | 245 |
for (eu.dnetlib.data.proto.ResultProtos.Result.Context context : result.getMetadata().getContextList()) { |
299 | 246 |
|
300 |
buffs.add(getStringField(context.getId()));
|
|
247 |
rels.put("resultConcept", getStringField(context.getId()));
|
|
301 | 248 |
} |
302 |
rels.put("resultConcept", buffs); |
|
303 |
return rels; |
|
304 |
|
|
305 | 249 |
} |
306 | 250 |
|
307 |
private HashMap<String, List<String>> getResultDatasources(OafEntity valueEntity) { |
|
308 | 251 |
|
309 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
|
310 |
List<String> buffs = new ArrayList<String>(); |
|
252 |
private void getResultDatasources(OafEntity valueEntity, Multimap<String, String> rels) { |
|
311 | 253 |
Result result = valueEntity.getResult(); |
312 | 254 |
|
313 | 255 |
//TODO hosted by |
314 | 256 |
for (Instance instance : (result.getInstanceList())) { |
315 | 257 |
String hostedBy = instance.getHostedby().getKey(); |
258 |
|
|
316 | 259 |
if (hostedBy != null && !hostedBy.isEmpty()) { |
317 |
buffs.add((getStringField(hostedBy)));
|
|
260 |
rels.put("resultDatasource", getStringField(hostedBy));
|
|
318 | 261 |
} |
319 | 262 |
} |
320 | 263 |
|
... | ... | |
322 | 265 |
for (FieldTypeProtos.KeyValue collectedFromValue : (valueEntity.getCollectedfromList())) { |
323 | 266 |
|
324 | 267 |
String collectedFrom = collectedFromValue.getKey(); |
325 |
if (collectedFrom != null && !collectedFrom.isEmpty()) buffs.add((getStringField(collectedFrom))); |
|
326 |
|
|
268 |
if (collectedFrom != null && !collectedFrom.isEmpty()) { |
|
269 |
rels.put("resultDatasource", getStringField(collectedFrom)); |
|
270 |
} |
|
327 | 271 |
} |
328 |
rels.put("resultDatasource", buffs); |
|
329 |
return rels; |
|
330 |
|
|
331 | 272 |
} |
332 | 273 |
|
333 |
private HashMap<String, List<String>> getResultTopics(OafEntity valueEntity) { |
|
334 |
HashMap<String, List<String>> rels = new HashMap<String, List<String>>(); |
Also available in: Unified diff
bundle commit