1 |
27955
|
claudio.at
|
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;
|
2 |
|
|
|
3 |
42734
|
eri.katsar
|
import com.google.common.collect.Multimap;
|
4 |
47072
|
tsampikos.
|
|
5 |
27955
|
claudio.at
|
import eu.dnetlib.data.mapreduce.util.LicenseComparator;
|
6 |
|
|
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
|
7 |
|
|
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
|
8 |
34194
|
eri.katsar
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
9 |
29712
|
eri.katsar
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
10 |
|
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
11 |
|
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
12 |
27955
|
claudio.at
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
13 |
|
|
import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
14 |
|
|
import eu.dnetlib.data.proto.OafProtos.OafRel;
|
15 |
|
|
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
16 |
|
|
import eu.dnetlib.data.proto.ProjectProtos.Project;
|
17 |
|
|
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
|
18 |
|
|
import eu.dnetlib.data.proto.ResultProtos.Result;
|
19 |
|
|
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
|
20 |
34084
|
eri.katsar
|
import org.apache.log4j.Logger;
|
21 |
27955
|
claudio.at
|
|
22 |
54431
|
tsampikos.
|
import java.text.DateFormat;
|
23 |
|
|
import java.text.ParseException;
|
24 |
34084
|
eri.katsar
|
import java.text.SimpleDateFormat;
|
25 |
|
|
import java.util.Date;
|
26 |
|
|
import java.util.List;
|
27 |
|
|
|
28 |
47072
|
tsampikos.
|
import org.w3c.dom.Element;
|
29 |
|
|
import org.w3c.dom.NodeList;
|
30 |
|
|
import org.xml.sax.InputSource;
|
31 |
|
|
import com.sun.org.apache.xerces.internal.parsers.DOMParser;
|
32 |
|
|
import org.w3c.dom.Document;
|
33 |
|
|
|
34 |
27955
|
claudio.at
|
/**
 * Simple serializer that parses input Oaf protos and prepares them for Sqoop.
 *
 * @author eri
 */
|
38 |
28471
|
eri.katsar
|
public class Serializer {
|
39 |
55644
|
antonis.le
|
/** Class-wide log4j logger. */
private static final Logger logger = Logger.getLogger(Serializer.class);

/** Separator appended after every serialized field; set once in the constructor. */
private final String DELIM;

/** String placed immediately before and after every serialized value; set once in the constructor. */
private final String ENCLOSING;

/**
 * Creates a serializer that uses the given field separator and enclosing string.
 *
 * @param DELIM     separator appended after each field
 * @param ENCLOSING string wrapped around each field value
 */
public Serializer(String DELIM, String ENCLOSING) {
    this.DELIM = DELIM;
    this.ENCLOSING = ENCLOSING;
}
|
48 |
|
|
|
49 |
|
|
public String serialize(Oaf oaf) {
|
50 |
|
|
|
51 |
41790
|
eri.katsar
|
switch (oaf.getKind()) {
|
52 |
|
|
case entity:
|
53 |
|
|
OafEntity valueEntity = oaf.getEntity();
|
54 |
27955
|
claudio.at
|
|
55 |
41790
|
eri.katsar
|
switch (valueEntity.getType()) {
|
56 |
|
|
case datasource:
|
57 |
27955
|
claudio.at
|
|
58 |
56504
|
antonis.le
|
return buildDatasource(oaf);
|
59 |
27955
|
claudio.at
|
|
60 |
41790
|
eri.katsar
|
case organization:
|
61 |
27955
|
claudio.at
|
|
62 |
56504
|
antonis.le
|
return buildOrganization(oaf);
|
63 |
27955
|
claudio.at
|
|
64 |
41790
|
eri.katsar
|
case project:
|
65 |
36921
|
eri.katsar
|
|
66 |
56504
|
antonis.le
|
return buildProject(oaf);
|
67 |
41790
|
eri.katsar
|
case result:
|
68 |
36921
|
eri.katsar
|
|
69 |
56504
|
antonis.le
|
return buildResult(oaf);
|
70 |
41790
|
eri.katsar
|
default:
|
71 |
|
|
break;
|
72 |
|
|
}
|
73 |
|
|
break;
|
74 |
|
|
case relation:
|
75 |
56504
|
antonis.le
|
return buildRel(oaf.getRel());
|
76 |
56505
|
antonis.le
|
}
|
77 |
27955
|
claudio.at
|
|
78 |
41790
|
eri.katsar
|
return null;
|
79 |
|
|
}
|
80 |
27955
|
claudio.at
|
|
81 |
56504
|
antonis.le
|
public String serialize(OafRel oaf) {
|
82 |
27955
|
claudio.at
|
|
83 |
41790
|
eri.katsar
|
switch (oaf.getRelType()) {
|
84 |
|
|
case resultProject:
|
85 |
56504
|
antonis.le
|
return getResultProject(oaf);
|
86 |
41790
|
eri.katsar
|
default:
|
87 |
56504
|
antonis.le
|
return buildRel(oaf);
|
88 |
41790
|
eri.katsar
|
}
|
89 |
|
|
}
|
90 |
27955
|
claudio.at
|
|
91 |
56504
|
antonis.le
|
private String buildRel(OafRel Rel) {
|
92 |
|
|
return cleanId(Rel.getTarget()) + DELIM;
|
93 |
41790
|
eri.katsar
|
}
|
94 |
27955
|
claudio.at
|
|
95 |
56504
|
antonis.le
|
public void extractRelations(Oaf oaf, Multimap<String, String> relations) {
|
96 |
42734
|
eri.katsar
|
OafEntity valueEntity = oaf.getEntity();
|
97 |
56504
|
antonis.le
|
getOriginalId(valueEntity, relations);
|
98 |
36689
|
eri.katsar
|
|
99 |
41790
|
eri.katsar
|
switch (valueEntity.getType()) {
|
100 |
|
|
case datasource:
|
101 |
56504
|
antonis.le
|
getDatasourceLanguages(valueEntity, relations);
|
102 |
57089
|
antonis.le
|
break;
|
103 |
41790
|
eri.katsar
|
case result:
|
104 |
56504
|
antonis.le
|
getResultTopics(valueEntity, relations);
|
105 |
|
|
getResultLanguages(valueEntity, relations);
|
106 |
|
|
getResultClassifications(valueEntity, relations);
|
107 |
|
|
getResultDatasources(valueEntity, relations);
|
108 |
|
|
getResultConcepts(valueEntity, relations);
|
109 |
|
|
getResultDois(valueEntity, relations);
|
110 |
|
|
getResultCitations(valueEntity, relations);
|
111 |
57089
|
antonis.le
|
break;
|
112 |
42734
|
eri.katsar
|
|
113 |
|
|
case project:
|
114 |
56504
|
antonis.le
|
getProjectKeywords(valueEntity, relations);
|
115 |
|
|
getProjectSubjects(valueEntity, relations);
|
116 |
57089
|
antonis.le
|
break;
|
117 |
42734
|
eri.katsar
|
}
|
118 |
29637
|
eri.katsar
|
|
119 |
42734
|
eri.katsar
|
}
|
120 |
|
|
|
121 |
56504
|
antonis.le
|
private void getOriginalId(OafEntity oafEntity, Multimap<String, String> relations) {
|
122 |
42734
|
eri.katsar
|
|
123 |
|
|
String relName = oafEntity.getType().toString().toLowerCase() + "Oid";
|
124 |
|
|
for (String oid : oafEntity.getOriginalIdList()) {
|
125 |
56504
|
antonis.le
|
relations.put(relName, cleanId(oid));
|
126 |
41790
|
eri.katsar
|
}
|
127 |
29739
|
eri.katsar
|
|
128 |
41790
|
eri.katsar
|
}
|
129 |
27955
|
claudio.at
|
|
130 |
56504
|
antonis.le
|
private void getProjectKeywords(OafEntity oafEntity, Multimap<String, String> relations) {
|
131 |
|
|
relations.put("projectKeyword", getStringField(oafEntity.getProject().getMetadata().getKeywords().getValue()));
|
132 |
29637
|
eri.katsar
|
|
133 |
42734
|
eri.katsar
|
}
|
134 |
27955
|
claudio.at
|
|
135 |
56504
|
antonis.le
|
private void getProjectSubjects(OafEntity oafEntity, Multimap<String, String> relations) {
|
136 |
42734
|
eri.katsar
|
for (StructuredProperty subj : oafEntity.getProject().getMetadata().getSubjectsList()) {
|
137 |
56504
|
antonis.le
|
relations.put("projectSubject", getStringField(subj.getValue()));
|
138 |
42734
|
eri.katsar
|
}
|
139 |
|
|
}
|
140 |
|
|
|
141 |
56504
|
antonis.le
|
private String getResultProject(OafRel oaf) {
|
142 |
42734
|
eri.katsar
|
StringBuilder buff = new StringBuilder();
|
143 |
56508
|
antonis.le
|
buff.append(cleanId(oaf.getTarget())).append(DELIM);
|
144 |
|
|
// is declared as int!!!
|
145 |
41790
|
eri.katsar
|
long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate());
|
146 |
56508
|
antonis.le
|
|
147 |
41790
|
eri.katsar
|
if (diff < 0) {
|
148 |
|
|
diff = 0;
|
149 |
|
|
}
|
150 |
27955
|
claudio.at
|
|
151 |
56504
|
antonis.le
|
buff.append(getNumericField(String.valueOf(diff)));
|
152 |
42734
|
eri.katsar
|
return buff.toString();
|
153 |
41790
|
eri.katsar
|
}
|
154 |
27955
|
claudio.at
|
|
155 |
|
|
|
156 |
56504
|
antonis.le
|
private void getDatasourceLanguages(OafEntity valueEntity, Multimap<String, String> rels) {
|
157 |
41790
|
eri.katsar
|
Datasource d = valueEntity.getDatasource();
|
158 |
|
|
Metadata metadata = d.getMetadata();
|
159 |
27955
|
claudio.at
|
|
160 |
41790
|
eri.katsar
|
for (StringField lang : metadata.getOdlanguagesList()) {
|
161 |
56504
|
antonis.le
|
rels.put("datasourceLanguage", getStringField(lang.getValue()));
|
162 |
41790
|
eri.katsar
|
}
|
163 |
|
|
}
|
164 |
27955
|
claudio.at
|
|
165 |
56504
|
antonis.le
|
private void getResultLanguages(OafEntity valueEntity, Multimap<String, String> rels) {
|
166 |
42734
|
eri.katsar
|
|
167 |
41790
|
eri.katsar
|
Result d = valueEntity.getResult();
|
168 |
42734
|
eri.katsar
|
Result.Metadata metadata = d.getMetadata();
|
169 |
|
|
if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) {
|
170 |
56504
|
antonis.le
|
rels.put("resultLanguage", getStringField(metadata.getLanguage().getClassname()));
|
171 |
42734
|
eri.katsar
|
}
|
172 |
27955
|
claudio.at
|
|
173 |
42734
|
eri.katsar
|
}
|
174 |
27955
|
claudio.at
|
|
175 |
56504
|
antonis.le
|
private void getResultDois(OafEntity valueEntity, Multimap<String, String> rels) {
|
176 |
27955
|
claudio.at
|
|
177 |
42734
|
eri.katsar
|
for (StructuredProperty pid : valueEntity.getPidList()) {
|
178 |
56504
|
antonis.le
|
rels.put("resultPid", getStringField(pid.getQualifier().getClassname()) + getStringField(pid.getValue()));
|
179 |
41790
|
eri.katsar
|
}
|
180 |
|
|
}
|
181 |
27955
|
claudio.at
|
|
182 |
56504
|
antonis.le
|
private void getResultClassifications(OafEntity valueEntity, Multimap<String, String> rels) {
|
183 |
27955
|
claudio.at
|
|
184 |
41790
|
eri.katsar
|
Result result = valueEntity.getResult();
|
185 |
29735
|
eri.katsar
|
|
186 |
41790
|
eri.katsar
|
for (Instance instance : (result.getInstanceList())) {
|
187 |
|
|
String classification = instance.getInstancetype().getClassname();
|
188 |
56504
|
antonis.le
|
|
189 |
41790
|
eri.katsar
|
if (classification != null && !classification.isEmpty()) {
|
190 |
56504
|
antonis.le
|
rels.put("resultClassification", getStringField(instance.getInstancetype().getClassname()));
|
191 |
41790
|
eri.katsar
|
}
|
192 |
|
|
}
|
193 |
42734
|
eri.katsar
|
}
|
194 |
27955
|
claudio.at
|
|
195 |
56504
|
antonis.le
|
private void getResultConcepts(OafEntity valueEntity, Multimap<String, String> rels) {
|
196 |
41790
|
eri.katsar
|
Result result = valueEntity.getResult();
|
197 |
27955
|
claudio.at
|
|
198 |
42734
|
eri.katsar
|
for (Result.Context context : result.getMetadata().getContextList()) {
|
199 |
56504
|
antonis.le
|
rels.put("resultConcept", cleanId(context.getId()));
|
200 |
41790
|
eri.katsar
|
}
|
201 |
|
|
}
|
202 |
27955
|
claudio.at
|
|
203 |
56504
|
antonis.le
|
private void getResultDatasources(OafEntity valueEntity, Multimap<String, String> rels) {
|
204 |
41790
|
eri.katsar
|
Result result = valueEntity.getResult();
|
205 |
29382
|
eri.katsar
|
|
206 |
56505
|
antonis.le
|
// hosted by
|
207 |
41790
|
eri.katsar
|
for (Instance instance : (result.getInstanceList())) {
|
208 |
|
|
String hostedBy = instance.getHostedby().getKey();
|
209 |
56504
|
antonis.le
|
|
210 |
41790
|
eri.katsar
|
if (hostedBy != null && !hostedBy.isEmpty()) {
|
211 |
56504
|
antonis.le
|
rels.put("resultDatasource", cleanId(hostedBy) + DELIM);
|
212 |
41790
|
eri.katsar
|
}
|
213 |
|
|
}
|
214 |
34084
|
eri.katsar
|
|
215 |
56505
|
antonis.le
|
// collected from
|
216 |
41790
|
eri.katsar
|
for (FieldTypeProtos.KeyValue collectedFromValue : (valueEntity.getCollectedfromList())) {
|
217 |
56504
|
antonis.le
|
String collectedFrom = collectedFromValue.getKey();
|
218 |
34084
|
eri.katsar
|
|
219 |
42734
|
eri.katsar
|
if (collectedFrom != null && !collectedFrom.isEmpty()) {
|
220 |
56504
|
antonis.le
|
rels.put("resultDatasource", cleanId(collectedFrom) + DELIM);
|
221 |
42734
|
eri.katsar
|
}
|
222 |
41790
|
eri.katsar
|
}
|
223 |
|
|
}
|
224 |
29386
|
eri.katsar
|
|
225 |
56504
|
antonis.le
|
private void getResultTopics(OafEntity valueEntity, Multimap<String, String> rels) {
|
226 |
42734
|
eri.katsar
|
Result d = valueEntity.getResult();
|
227 |
|
|
Result.Metadata metadata = d.getMetadata();
|
228 |
|
|
List<StructuredProperty> Topics = metadata.getSubjectList();
|
229 |
29754
|
eri.katsar
|
|
230 |
42734
|
eri.katsar
|
for (StructuredProperty topic : Topics) {
|
231 |
56504
|
antonis.le
|
rels.put("resultTopic", getStringField(topic.getValue()));
|
232 |
41790
|
eri.katsar
|
}
|
233 |
|
|
}
|
234 |
29957
|
eri.katsar
|
|
235 |
27955
|
claudio.at
|
|
236 |
56504
|
antonis.le
|
private void getResultCitations(OafEntity oafEntity, Multimap<String, String> rels) {
|
237 |
42734
|
eri.katsar
|
for (FieldTypeProtos.ExtraInfo extraInfo : oafEntity.getExtraInfoList()) {
|
238 |
|
|
if (extraInfo.getName().equals("result citations")) {
|
239 |
47072
|
tsampikos.
|
DOMParser parser = new DOMParser();
|
240 |
|
|
try {
|
241 |
|
|
parser.parse(new InputSource(new java.io.StringReader(extraInfo.getValue())));
|
242 |
|
|
Document doc = parser.getDocument();
|
243 |
|
|
doc.getDocumentElement().normalize();
|
244 |
|
|
|
245 |
|
|
NodeList citations = doc.getElementsByTagName("citation");
|
246 |
|
|
for (int temp = 0; temp < citations.getLength(); temp++) {
|
247 |
|
|
Element citation = (Element) citations.item(temp);
|
248 |
|
|
NodeList ids = citation.getElementsByTagName("id");
|
249 |
|
|
for(int temp1 = 0; temp1 < ids.getLength(); temp1++){
|
250 |
|
|
Element id = (Element) ids.item(temp1);
|
251 |
|
|
if(id.getAttribute("type").equals("openaire")){
|
252 |
|
|
//System.out.println(id.getAttribute("value"));
|
253 |
|
|
rels.put("resultCitation", id.getAttribute("value"));
|
254 |
|
|
}
|
255 |
|
|
}
|
256 |
|
|
}
|
257 |
|
|
} catch (Exception e) {
|
258 |
56508
|
antonis.le
|
logger.error("Error getting result citations", e);
|
259 |
47072
|
tsampikos.
|
}
|
260 |
41790
|
eri.katsar
|
}
|
261 |
|
|
}
|
262 |
|
|
}
|
263 |
27955
|
claudio.at
|
|
264 |
56504
|
antonis.le
|
private String buildDatasource(Oaf oaf) {
|
265 |
42734
|
eri.katsar
|
Metadata metadata = oaf.getEntity().getDatasource().getMetadata();
|
266 |
|
|
StringBuilder buff = new StringBuilder();
|
267 |
27955
|
claudio.at
|
|
268 |
41790
|
eri.katsar
|
// name
|
269 |
|
|
if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) {
|
270 |
56504
|
antonis.le
|
buff.append(getStringField("Unknown Repository"));
|
271 |
41790
|
eri.katsar
|
} else {
|
272 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getOfficialname().getValue()));
|
273 |
41790
|
eri.katsar
|
}
|
274 |
54431
|
tsampikos.
|
|
275 |
41790
|
eri.katsar
|
// type
|
276 |
42734
|
eri.katsar
|
if (metadata.hasDatasourcetype()) {
|
277 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", "")));
|
278 |
41790
|
eri.katsar
|
}
|
279 |
27955
|
claudio.at
|
|
280 |
41790
|
eri.katsar
|
// compatibility,
|
281 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getOpenairecompatibility().getClassname()));
|
282 |
27955
|
claudio.at
|
|
283 |
55644
|
antonis.le
|
// latitude
|
284 |
56504
|
antonis.le
|
buff.append(getLatLongField(metadata.getLatitude().getValue()));
|
285 |
55644
|
antonis.le
|
|
286 |
|
|
// longtitude
|
287 |
56504
|
antonis.le
|
buff.append(getLatLongField(metadata.getLongitude().getValue()));
|
288 |
55644
|
antonis.le
|
|
289 |
41790
|
eri.katsar
|
// dateofvalidation,
|
290 |
56504
|
antonis.le
|
buff.append(getStringDateField(metadata.getDateofvalidation().getValue()));
|
291 |
27955
|
claudio.at
|
|
292 |
41790
|
eri.katsar
|
// yearofvalidation,
|
293 |
56504
|
antonis.le
|
buff.append(getYearInt(metadata.getDateofvalidation().getValue()));
|
294 |
27955
|
claudio.at
|
|
295 |
54431
|
tsampikos.
|
//harvested
|
296 |
56504
|
antonis.le
|
buff.append(getStringField("false"));
|
297 |
27955
|
claudio.at
|
|
298 |
45523
|
tsampikos.
|
//piwik_id
|
299 |
|
|
String piwik_id = "";
|
300 |
|
|
for (String oid : oaf.getEntity().getOriginalIdList()) {
|
301 |
|
|
if (oid.contains("piwik")) {
|
302 |
|
|
piwik_id = oid.split(":")[1];
|
303 |
|
|
break;
|
304 |
|
|
}
|
305 |
|
|
}
|
306 |
56504
|
antonis.le
|
buff.append(getStringField(cleanNumber(piwik_id)));
|
307 |
45523
|
tsampikos.
|
|
308 |
57521
|
antonis.le
|
buff.append(getStringField(metadata.getWebsiteurl().getValue()));
|
309 |
|
|
|
310 |
42734
|
eri.katsar
|
return buff.toString();
|
311 |
27955
|
claudio.at
|
|
312 |
41790
|
eri.katsar
|
}
|
313 |
27955
|
claudio.at
|
|
314 |
56504
|
antonis.le
|
private String buildOrganization(Oaf oaf) {
|
315 |
27955
|
claudio.at
|
|
316 |
42734
|
eri.katsar
|
StringBuilder buff = new StringBuilder();
|
317 |
|
|
Organization.Metadata metadata = oaf.getEntity().getOrganization().getMetadata();
|
318 |
27955
|
claudio.at
|
|
319 |
41790
|
eri.katsar
|
// `name`,
|
320 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getLegalname().getValue()));
|
321 |
42734
|
eri.katsar
|
|
322 |
41790
|
eri.katsar
|
// `country`,
|
323 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getCountry().getClassid()));
|
324 |
29323
|
eri.katsar
|
|
325 |
42734
|
eri.katsar
|
return buff.toString();
|
326 |
41790
|
eri.katsar
|
}
|
327 |
27955
|
claudio.at
|
|
328 |
56504
|
antonis.le
|
private String buildResult(Oaf oaf) {
|
329 |
42734
|
eri.katsar
|
StringBuilder buff = new StringBuilder();
|
330 |
27955
|
claudio.at
|
|
331 |
42734
|
eri.katsar
|
Result.Metadata metadata = oaf.getEntity().getResult().getMetadata();
|
332 |
27955
|
claudio.at
|
|
333 |
57521
|
antonis.le
|
// originalId
|
334 |
|
|
buff.append(getId(oaf)).append(DELIM);
|
335 |
|
|
|
336 |
56508
|
antonis.le
|
String titleString = "";
|
337 |
56483
|
antonis.le
|
|
338 |
56508
|
antonis.le
|
if (metadata.getTitleList().size() > 0) {
|
339 |
|
|
StructuredProperty title = metadata.getTitleList().get(0);
|
340 |
56483
|
antonis.le
|
|
341 |
56508
|
antonis.le
|
titleString = title.getValue().replaceAll("\\s+", " ");
|
342 |
|
|
titleString = titleString.replaceAll("\n", " ");
|
343 |
56483
|
antonis.le
|
}
|
344 |
|
|
|
345 |
|
|
// pubtitle
|
346 |
56504
|
antonis.le
|
buff.append(getStringField(titleString));
|
347 |
56483
|
antonis.le
|
|
348 |
42734
|
eri.katsar
|
// publisher
|
349 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getPublisher().getValue()));
|
350 |
42734
|
eri.katsar
|
|
351 |
|
|
// journal
|
352 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getJournal().getName())); //#null#!
|
353 |
42734
|
eri.katsar
|
|
354 |
41790
|
eri.katsar
|
// year
|
355 |
56504
|
antonis.le
|
buff.append(getYearInt(metadata.getDateofacceptance().getValue()));
|
356 |
27955
|
claudio.at
|
|
357 |
54431
|
tsampikos.
|
// date
|
358 |
56504
|
antonis.le
|
buff.append(getStringDateField(metadata.getDateofacceptance().getValue()));
|
359 |
29211
|
eri.katsar
|
|
360 |
41790
|
eri.katsar
|
// bestlicense
|
361 |
56504
|
antonis.le
|
buff.append(getStringField(getBestLicense(oaf.getEntity().getResult())));
|
362 |
29735
|
eri.katsar
|
|
363 |
41790
|
eri.katsar
|
// type
|
364 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getResulttype().getClassname()));
|
365 |
42734
|
eri.katsar
|
|
366 |
41790
|
eri.katsar
|
// embargo_end_date
|
367 |
56504
|
antonis.le
|
buff.append(getStringDateField(metadata.getEmbargoenddate().getValue()));
|
368 |
29637
|
eri.katsar
|
|
369 |
41790
|
eri.katsar
|
// `authors`,
|
370 |
54431
|
tsampikos.
|
int authors = metadata.getAuthorCount();
|
371 |
41790
|
eri.katsar
|
String delayed = "no";
|
372 |
27955
|
claudio.at
|
|
373 |
42734
|
eri.katsar
|
for (OafRel rel : oaf.getEntity().getCachedRelList()) {
|
374 |
48302
|
tsampikos.
|
if (rel.getRelType().equals(RelType.resultProject))
|
375 |
56505
|
antonis.le
|
// remember : in result Project, first id is project, second is result.
|
376 |
41790
|
eri.katsar
|
{
|
377 |
42734
|
eri.katsar
|
String daysfromend = getYearDifferenceInteger(rel.getResultProject().getOutcome().getRelMetadata().getEnddate(),
|
378 |
56504
|
antonis.le
|
rel.getResultProject().getOutcome().getRelMetadata().getStartdate());
|
379 |
41790
|
eri.katsar
|
if (Integer.parseInt(daysfromend) > 0) {
|
380 |
|
|
delayed = "yes";
|
381 |
|
|
}
|
382 |
|
|
}
|
383 |
|
|
}
|
384 |
42734
|
eri.katsar
|
|
385 |
41790
|
eri.katsar
|
// `delayed`,
|
386 |
56504
|
antonis.le
|
buff.append(getStringField(delayed));
|
387 |
42734
|
eri.katsar
|
//authors
|
388 |
56504
|
antonis.le
|
buff.append(getNumericField(String.valueOf(authors)));
|
389 |
29336
|
eri.katsar
|
|
390 |
57521
|
antonis.le
|
String authorNames = "";
|
391 |
|
|
for (FieldTypeProtos.Author author:metadata.getAuthorList()) {
|
392 |
|
|
authorNames += author.getFullname() + ";";
|
393 |
|
|
}
|
394 |
|
|
|
395 |
|
|
buff.append(getStringField(authorNames));
|
396 |
|
|
|
397 |
56508
|
antonis.le
|
String sources = "";
|
398 |
56483
|
antonis.le
|
|
399 |
|
|
for (Instance instance : (oaf.getEntity().getResult().getInstanceList())) {
|
400 |
|
|
List<String> urls = instance.getUrlList();
|
401 |
|
|
for (String url : urls) {
|
402 |
56504
|
antonis.le
|
sources += cleanUrl(url) + " ;";
|
403 |
56483
|
antonis.le
|
}
|
404 |
|
|
}
|
405 |
|
|
|
406 |
|
|
//sources
|
407 |
|
|
sources = ENCLOSING + sources + ENCLOSING + DELIM;
|
408 |
|
|
|
409 |
|
|
buff.append(sources);
|
410 |
|
|
|
411 |
57089
|
antonis.le
|
boolean hasAbstract = false;
|
412 |
|
|
for (StringField desc:metadata.getDescriptionList()) {
|
413 |
|
|
if (desc != null && desc.getValue() != null && !desc.getValue().trim().isEmpty())
|
414 |
|
|
hasAbstract = true;
|
415 |
|
|
}
|
416 |
|
|
|
417 |
|
|
buff.append(getStringField(Boolean.toString(hasAbstract)));
|
418 |
|
|
|
419 |
42734
|
eri.katsar
|
return buff.toString();
|
420 |
|
|
|
421 |
41790
|
eri.katsar
|
}
|
422 |
31183
|
eri.katsar
|
|
423 |
56504
|
antonis.le
|
private String getBestLicense(Result result) {
|
424 |
53034
|
tsampikos.
|
Qualifier bestLicense = null;
|
425 |
|
|
LicenseComparator lc = new LicenseComparator();
|
426 |
|
|
for (Instance instance : (result.getInstanceList())) {
|
427 |
50242
|
tsampikos.
|
if (lc.compare(bestLicense, instance.getAccessright()) > 0) {
|
428 |
|
|
bestLicense = instance.getAccessright();
|
429 |
41790
|
eri.katsar
|
}
|
430 |
|
|
}
|
431 |
|
|
if (bestLicense != null) {
|
432 |
|
|
return bestLicense.getClassname();
|
433 |
|
|
} else {
|
434 |
55644
|
antonis.le
|
return "";
|
435 |
41790
|
eri.katsar
|
}
|
436 |
|
|
}
|
437 |
27955
|
claudio.at
|
|
438 |
56504
|
antonis.le
|
private String buildProject(Oaf oaf) {
|
439 |
27955
|
claudio.at
|
|
440 |
56504
|
antonis.le
|
FundingParser fundingParser = new FundingParser(DELIM, ENCLOSING);
|
441 |
42734
|
eri.katsar
|
StringBuilder buff = new StringBuilder();
|
442 |
|
|
Project.Metadata metadata = oaf.getEntity().getProject().getMetadata();
|
443 |
43392
|
tsampikos.
|
|
444 |
41790
|
eri.katsar
|
// `acronym`,
|
445 |
|
|
String acronym = metadata.getAcronym().getValue();
|
446 |
|
|
if (acronym.equalsIgnoreCase("UNKNOWN")) {
|
447 |
|
|
acronym = metadata.getTitle().getValue();
|
448 |
|
|
}
|
449 |
56504
|
antonis.le
|
buff.append(getStringField(acronym));
|
450 |
31183
|
eri.katsar
|
|
451 |
54431
|
tsampikos.
|
//title
|
452 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getTitle().getValue()));
|
453 |
41790
|
eri.katsar
|
|
454 |
54431
|
tsampikos.
|
//funding_lvl
|
455 |
41790
|
eri.katsar
|
List<StringField> fundList = metadata.getFundingtreeList();
|
456 |
|
|
if (!fundList.isEmpty()) // `funding_lvl0`,
|
457 |
|
|
{
|
458 |
56505
|
antonis.le
|
//funder + 3 funding levels
|
459 |
56504
|
antonis.le
|
buff.append(fundingParser.getFundingInfo(fundList.get(0).getValue()));
|
460 |
41790
|
eri.katsar
|
} else {
|
461 |
56504
|
antonis.le
|
buff.append(fundingParser.getFundingInfo(""));
|
462 |
41790
|
eri.katsar
|
}
|
463 |
36995
|
eri.katsar
|
|
464 |
54431
|
tsampikos.
|
//sc39
|
465 |
56508
|
antonis.le
|
String sc39 = metadata.getEcsc39().getValue();
|
466 |
41790
|
eri.katsar
|
if (sc39.equalsIgnoreCase("true") || sc39.equalsIgnoreCase("t") || sc39.contains("yes")) {
|
467 |
|
|
sc39 = "yes";
|
468 |
|
|
} else if (sc39.equalsIgnoreCase("false") || sc39.equalsIgnoreCase("f") || sc39.contains("no")) {
|
469 |
|
|
sc39 = "no";
|
470 |
|
|
}
|
471 |
56504
|
antonis.le
|
buff.append(getStringField(sc39));
|
472 |
27955
|
claudio.at
|
|
473 |
45523
|
tsampikos.
|
//project_type
|
474 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getContracttype().getClassid()));
|
475 |
45523
|
tsampikos.
|
|
476 |
41790
|
eri.katsar
|
// start_year
|
477 |
56504
|
antonis.le
|
buff.append(getYearInt(metadata.getStartdate().getValue()));
|
478 |
27955
|
claudio.at
|
|
479 |
41790
|
eri.katsar
|
// end_year
|
480 |
56504
|
antonis.le
|
buff.append(getYearInt(metadata.getEnddate().getValue()));
|
481 |
27955
|
claudio.at
|
|
482 |
41790
|
eri.katsar
|
// duration enddate-startdate
|
483 |
56504
|
antonis.le
|
buff.append(getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue()));
|
484 |
27955
|
claudio.at
|
|
485 |
41790
|
eri.katsar
|
// haspubs
|
486 |
56504
|
antonis.le
|
buff.append(getStringField("no"));
|
487 |
27955
|
claudio.at
|
|
488 |
41790
|
eri.katsar
|
// numpubs
|
489 |
56504
|
antonis.le
|
buff.append(getNumericField("0"));
|
490 |
42734
|
eri.katsar
|
|
491 |
41790
|
eri.katsar
|
// enddate
|
492 |
56504
|
antonis.le
|
buff.append(getStringDateField(metadata.getEnddate().getValue()));
|
493 |
42734
|
eri.katsar
|
|
494 |
41790
|
eri.katsar
|
// startdate
|
495 |
56504
|
antonis.le
|
buff.append(getStringDateField(metadata.getStartdate().getValue()));
|
496 |
27955
|
claudio.at
|
|
497 |
41790
|
eri.katsar
|
// `daysforlastpub`,
|
498 |
56504
|
antonis.le
|
buff.append(getNumericField(""));
|
499 |
42734
|
eri.katsar
|
|
500 |
41790
|
eri.katsar
|
// `delayedpubs`,
|
501 |
56504
|
antonis.le
|
buff.append(getNumericField(""));
|
502 |
42734
|
eri.katsar
|
|
503 |
|
|
//call identifier
|
504 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getCallidentifier().getValue()));
|
505 |
54431
|
tsampikos.
|
|
506 |
42734
|
eri.katsar
|
//code
|
507 |
56504
|
antonis.le
|
buff.append(getStringField(metadata.getCode().getValue()));
|
508 |
42734
|
eri.katsar
|
|
509 |
|
|
return buff.toString();
|
510 |
41790
|
eri.katsar
|
}
|
511 |
27955
|
claudio.at
|
|
512 |
|
|
|
513 |
56504
|
antonis.le
|
private String getYearDifferenceInteger(String enddate, String startdate) {
|
514 |
34202
|
eri.katsar
|
|
515 |
41790
|
eri.katsar
|
if (enddate != null && !enddate.isEmpty() && startdate != null && !startdate.isEmpty()) {
|
516 |
29754
|
eri.katsar
|
|
517 |
41790
|
eri.katsar
|
String[] split = startdate.split("-");
|
518 |
27955
|
claudio.at
|
|
519 |
56508
|
antonis.le
|
if (split.length == 0) {
|
520 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
521 |
41790
|
eri.katsar
|
}
|
522 |
27955
|
claudio.at
|
|
523 |
41790
|
eri.katsar
|
int Startdate = Integer.parseInt(split[0]);
|
524 |
27955
|
claudio.at
|
|
525 |
41790
|
eri.katsar
|
split = enddate.split("-");
|
526 |
27955
|
claudio.at
|
|
527 |
56508
|
antonis.le
|
if (split.length == 0) {
|
528 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
529 |
41790
|
eri.katsar
|
}
|
530 |
27955
|
claudio.at
|
|
531 |
41790
|
eri.katsar
|
int Enddate = Integer.parseInt(split[0]);
|
532 |
29211
|
eri.katsar
|
|
533 |
41790
|
eri.katsar
|
int diff = Enddate - Startdate;
|
534 |
29211
|
eri.katsar
|
|
535 |
42734
|
eri.katsar
|
return ENCLOSING + diff + ENCLOSING + DELIM;
|
536 |
29384
|
eri.katsar
|
|
537 |
41790
|
eri.katsar
|
}
|
538 |
31279
|
eri.katsar
|
|
539 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
540 |
41790
|
eri.katsar
|
}
|
541 |
29211
|
eri.katsar
|
|
542 |
56504
|
antonis.le
|
private String getYearInt(String data) {
|
543 |
41790
|
eri.katsar
|
if (data == null || data.isEmpty() || data.equals("-1")) {
|
544 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
545 |
41790
|
eri.katsar
|
}
|
546 |
27955
|
claudio.at
|
|
547 |
41790
|
eri.katsar
|
String[] split = data.split("-");
|
548 |
29384
|
eri.katsar
|
|
549 |
56508
|
antonis.le
|
if (split.length == 0) {
|
550 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
551 |
41790
|
eri.katsar
|
}
|
552 |
29336
|
eri.katsar
|
|
553 |
41790
|
eri.katsar
|
String year = split[0];
|
554 |
29336
|
eri.katsar
|
|
555 |
41790
|
eri.katsar
|
year = cleanNumber(year);
|
556 |
27955
|
claudio.at
|
|
557 |
42734
|
eri.katsar
|
if (year == null || year.isEmpty()) year = "0";
|
558 |
30977
|
eri.katsar
|
|
559 |
42734
|
eri.katsar
|
return ENCLOSING + year + ENCLOSING + DELIM;
|
560 |
27955
|
claudio.at
|
|
561 |
41790
|
eri.katsar
|
}
|
562 |
30043
|
eri.katsar
|
|
563 |
56504
|
antonis.le
|
private String cleanNumber(String number) {
|
564 |
56508
|
antonis.le
|
number = number.replaceAll("[^A-Za-z0-9:,_]", "");
|
565 |
41790
|
eri.katsar
|
return number;
|
566 |
|
|
}
|
567 |
30043
|
eri.katsar
|
|
568 |
56504
|
antonis.le
|
private String getLatLongField(String data) {
|
569 |
43739
|
tsampikos.
|
|
570 |
56504
|
antonis.le
|
if (data == null || data.isEmpty())
|
571 |
|
|
return ENCLOSING + "null" + ENCLOSING + DELIM;
|
572 |
43739
|
tsampikos.
|
|
573 |
|
|
return ENCLOSING + data.replaceAll("[^-0-9.]+", "") + ENCLOSING + DELIM;
|
574 |
|
|
|
575 |
|
|
}
|
576 |
|
|
|
577 |
56504
|
antonis.le
|
private String getStringField(String data) {
|
578 |
30977
|
eri.katsar
|
|
579 |
56504
|
antonis.le
|
if (data == null || data.isEmpty())
|
580 |
|
|
return ENCLOSING + "null" + ENCLOSING + DELIM;
|
581 |
27955
|
claudio.at
|
|
582 |
56504
|
antonis.le
|
return ENCLOSING + clean(data) + ENCLOSING + DELIM;
|
583 |
27955
|
claudio.at
|
|
584 |
41790
|
eri.katsar
|
}
|
585 |
27955
|
claudio.at
|
|
586 |
56504
|
antonis.le
|
private String getStringDateField(String data) {
|
587 |
56508
|
antonis.le
|
if (data == null || data.isEmpty() || data.equals("-1")) {
|
588 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
589 |
41790
|
eri.katsar
|
} else {
|
590 |
42734
|
eri.katsar
|
data = data.replace(DELIM, " ");
|
591 |
|
|
data = data.replace(ENCLOSING, " ");
|
592 |
53034
|
tsampikos.
|
data = data.replaceAll("\\r\\n|\\r|\\n", "");
|
593 |
54431
|
tsampikos.
|
try {
|
594 |
|
|
DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
|
595 |
|
|
data = format.format(format.parse(data));
|
596 |
|
|
return ENCLOSING + data + ENCLOSING + DELIM;
|
597 |
|
|
} catch (ParseException e) {
|
598 |
|
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
599 |
|
|
}
|
600 |
41790
|
eri.katsar
|
}
|
601 |
|
|
}
|
602 |
27955
|
claudio.at
|
|
603 |
56504
|
antonis.le
|
private String getNumericField(String data) {
|
604 |
56508
|
antonis.le
|
if (data == null || data.isEmpty()) {
|
605 |
42734
|
eri.katsar
|
return ENCLOSING + "0" + ENCLOSING + DELIM;
|
606 |
41790
|
eri.katsar
|
} else {
|
607 |
42734
|
eri.katsar
|
return ENCLOSING + data + ENCLOSING + DELIM;
|
608 |
41790
|
eri.katsar
|
}
|
609 |
|
|
}
|
610 |
29634
|
eri.katsar
|
|
611 |
56504
|
antonis.le
|
public String getId(Oaf oaf) {
|
612 |
41790
|
eri.katsar
|
switch (oaf.getKind()) {
|
613 |
|
|
case entity:
|
614 |
56504
|
antonis.le
|
return cleanId(oaf.getEntity().getId());
|
615 |
41790
|
eri.katsar
|
case relation:
|
616 |
56504
|
antonis.le
|
return cleanId(oaf.getRel().getSource());
|
617 |
36689
|
eri.katsar
|
|
618 |
41790
|
eri.katsar
|
}
|
619 |
|
|
return null;
|
620 |
27955
|
claudio.at
|
|
621 |
41790
|
eri.katsar
|
}
|
622 |
27955
|
claudio.at
|
|
623 |
56504
|
antonis.le
|
public String getId(OafRel relOaf) {
|
624 |
|
|
return cleanId(relOaf.getSource());
|
625 |
41790
|
eri.katsar
|
}
|
626 |
27955
|
claudio.at
|
|
627 |
56508
|
antonis.le
|
private String clean(String value) {
|
628 |
41790
|
eri.katsar
|
if (value != null) {
|
629 |
36689
|
eri.katsar
|
|
630 |
42734
|
eri.katsar
|
value = value.replaceAll("[\"\\r\\\\;]", "");
|
631 |
|
|
value = value.replace(DELIM, " ");
|
632 |
|
|
value = value.replace(ENCLOSING, " ");
|
633 |
47371
|
tsampikos.
|
value = value.replaceAll("\\r\\n|\\r|\\n", " ");
|
634 |
36689
|
eri.katsar
|
|
635 |
42734
|
eri.katsar
|
return value;
|
636 |
|
|
} else {
|
637 |
|
|
return "";
|
638 |
41790
|
eri.katsar
|
}
|
639 |
27955
|
claudio.at
|
|
640 |
41790
|
eri.katsar
|
}
|
641 |
27955
|
claudio.at
|
|
642 |
56508
|
antonis.le
|
private String cleanId(String value) {
|
643 |
41790
|
eri.katsar
|
if (value != null) {
|
644 |
56505
|
antonis.le
|
// DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( "5|datacite____::" to "datacite____::")
|
645 |
|
|
// AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA
|
646 |
41790
|
eri.katsar
|
value = value.replaceFirst(".*\\|", "");
|
647 |
42734
|
eri.katsar
|
value = value.replace("\n", "");
|
648 |
|
|
value = value.replace(ENCLOSING, "");
|
649 |
|
|
value = value.replace(DELIM, "");
|
650 |
|
|
value = value.replace("\"", "");
|
651 |
|
|
value = value.replace("«", " ");
|
652 |
|
|
value = value.replace("»", " ");
|
653 |
41790
|
eri.katsar
|
}
|
654 |
29336
|
eri.katsar
|
|
655 |
42734
|
eri.katsar
|
return ENCLOSING + value + ENCLOSING;
|
656 |
41790
|
eri.katsar
|
}
|
657 |
31900
|
eri.katsar
|
|
658 |
56508
|
antonis.le
|
private String cleanUrl(String value) {
|
659 |
42734
|
eri.katsar
|
value = value.replace(DELIM, " ");
|
660 |
|
|
value = value.replace(ENCLOSING, " ");
|
661 |
|
|
value = value.replace(" ", "");
|
662 |
|
|
value = value.replace("\n", "");
|
663 |
57089
|
antonis.le
|
value = value.replace("\r", "");
|
664 |
|
|
value = value.replace("\\n", "");
|
665 |
|
|
value = value.replace("\\r", "");
|
666 |
42734
|
eri.katsar
|
return value;
|
667 |
|
|
}
|
668 |
|
|
|
669 |
56508
|
antonis.le
|
private long DATEDIFF(String startDate, String endDate) {
|
670 |
|
|
long MILLISECS_PER_DAY = 24 * 60 * 60 * 1000L;
|
671 |
|
|
long days;
|
672 |
41790
|
eri.katsar
|
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); // "dd/MM/yyyy HH:mm:ss");
|
673 |
|
|
// <startdate>2011-09-01</startdate>
|
674 |
|
|
// <enddate>2015-08-31</enddate>
|
675 |
56508
|
antonis.le
|
Date dateIni;
|
676 |
|
|
Date dateFin;
|
677 |
37693
|
eri.katsar
|
|
678 |
41790
|
eri.katsar
|
if (startDate == null || startDate.isEmpty() || endDate == null || endDate.isEmpty()) {
|
679 |
|
|
return 0;
|
680 |
|
|
}
|
681 |
|
|
try {
|
682 |
56508
|
antonis.le
|
dateIni = format.parse(startDate);
|
683 |
|
|
dateFin = format.parse(endDate);
|
684 |
41790
|
eri.katsar
|
days = (dateFin.getTime() - dateIni.getTime()) / MILLISECS_PER_DAY;
|
685 |
|
|
} catch (Exception e) {
|
686 |
42734
|
eri.katsar
|
|
687 |
41790
|
eri.katsar
|
return 0;
|
688 |
|
|
}
|
689 |
37693
|
eri.katsar
|
|
690 |
41790
|
eri.katsar
|
return days;
|
691 |
|
|
}
|
692 |
27955
|
claudio.at
|
}
|