1
|
package eu.dnetlib.iis.importer.converter;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.util.ArrayList;
|
5
|
import java.util.Collection;
|
6
|
import java.util.HashSet;
|
7
|
import java.util.List;
|
8
|
import java.util.Set;
|
9
|
|
10
|
import org.apache.hadoop.hbase.client.Result;
|
11
|
import org.apache.log4j.Logger;
|
12
|
import org.json.simple.JSONObject;
|
13
|
import org.json.simple.parser.JSONParser;
|
14
|
import org.json.simple.parser.ParseException;
|
15
|
|
16
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
17
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
18
|
import eu.dnetlib.iis.importer.input.approver.ResultApprover;
|
19
|
import eu.dnetlib.iis.importer.schemas.Project;
|
20
|
|
21
|
/**
|
22
|
* HBase {@link Result} to avro {@link Project} converter.
|
23
|
* @author mhorst
|
24
|
*
|
25
|
*/
|
26
|
public class ProjectConverter extends AbstractAvroConverter<Project> {
|
27
|
|
28
|
protected static final Logger log = Logger.getLogger(ProjectConverter.class);
|
29
|
|
30
|
private static final String ELEM_FUNDING_TREE_PARENT = "parent";
|
31
|
private static final String ELEM_FUNDING_TREE_NAME = "name";
|
32
|
|
33
|
private static final Set<String> ACRONYM_SKIP_LOWERCASED_VALUES = new HashSet<String>();
|
34
|
|
35
|
{
|
36
|
ACRONYM_SKIP_LOWERCASED_VALUES.add("undefined");
|
37
|
ACRONYM_SKIP_LOWERCASED_VALUES.add("unknown");
|
38
|
}
|
39
|
|
40
|
/**
|
41
|
* Default constructor.
|
42
|
* @param encoding
|
43
|
* @param resultApprover
|
44
|
*/
|
45
|
public ProjectConverter(String encoding,
|
46
|
ResultApprover resultApprover) {
|
47
|
super(encoding, resultApprover);
|
48
|
}
|
49
|
|
50
|
@Override
|
51
|
public Project buildObject(Result source, Oaf resolvedOafObject) throws IOException {
|
52
|
eu.dnetlib.data.proto.ProjectProtos.Project sourceProject = resolvedOafObject.getEntity()!=null?
|
53
|
resolvedOafObject.getEntity().getProject():null;
|
54
|
if (sourceProject==null) {
|
55
|
log.error("skipping: no project object " +
|
56
|
"for a row " + new String(source.getRow(), getEncoding()));
|
57
|
return null;
|
58
|
}
|
59
|
if (resolvedOafObject.getEntity().getId()!=null &&
|
60
|
!resolvedOafObject.getEntity().getId().isEmpty()) {
|
61
|
Project.Builder builder = Project.newBuilder();
|
62
|
builder.setId(resolvedOafObject.getEntity().getId());
|
63
|
if (sourceProject.getMetadata()!=null) {
|
64
|
if (isAcronymValid(sourceProject.getMetadata().getAcronym())) {
|
65
|
builder.setProjectAcronym(sourceProject.getMetadata().getAcronym().getValue());
|
66
|
}
|
67
|
if (sourceProject.getMetadata().getCode()!=null &&
|
68
|
sourceProject.getMetadata().getCode().getValue()!=null &&
|
69
|
!sourceProject.getMetadata().getCode().getValue().isEmpty()) {
|
70
|
builder.setProjectGrantId(sourceProject.getMetadata().getCode().getValue());
|
71
|
}
|
72
|
String extractedFundingClass = extractFundingClass(
|
73
|
extractStringValues(sourceProject.getMetadata().getFundingtreeList()));
|
74
|
if (extractedFundingClass!=null && !extractedFundingClass.isEmpty()) {
|
75
|
builder.setFundingClass(extractedFundingClass);
|
76
|
}
|
77
|
}
|
78
|
return builder.build();
|
79
|
} else {
|
80
|
log.warn("unable to extract grant number: " +
|
81
|
"unsupported project id: " + resolvedOafObject.getEntity().getId());
|
82
|
return null;
|
83
|
}
|
84
|
}
|
85
|
|
86
|
/**
|
87
|
* Extracts string values from {@link StringField} list.
|
88
|
* @param source
|
89
|
* @return string values extracted from {@link StringField} list
|
90
|
*/
|
91
|
protected static List<String> extractStringValues(List<StringField> source) {
|
92
|
if (source!=null) {
|
93
|
List<String> results = new ArrayList<String>(source.size());
|
94
|
for (StringField currentField : source) {
|
95
|
results.add(currentField.getValue());
|
96
|
}
|
97
|
return results;
|
98
|
} else {
|
99
|
return null;
|
100
|
}
|
101
|
}
|
102
|
|
103
|
/**
|
104
|
* Verifies whether acronym should be considered as valid.
|
105
|
* @param acronym
|
106
|
* @return true if valid, false otherwise
|
107
|
*/
|
108
|
public static boolean isAcronymValid(StringField acronym) {
|
109
|
return acronym!=null && acronym.getValue()!=null && !acronym.getValue().isEmpty() &&
|
110
|
!ACRONYM_SKIP_LOWERCASED_VALUES.contains(acronym.getValue().trim().toLowerCase());
|
111
|
}
|
112
|
|
113
|
/**
|
114
|
* Extracts funding class from funding tree.
|
115
|
* @param fundingTreeJson
|
116
|
* @return extracted funding class
|
117
|
* @throws IOException
|
118
|
*/
|
119
|
public static String extractFundingClass(List<String> fundingTreeList) throws IOException {
|
120
|
if (fundingTreeList!=null && !fundingTreeList.isEmpty()) {
|
121
|
for (String currentFundingTreeJson : fundingTreeList) {
|
122
|
if (currentFundingTreeJson!=null && !currentFundingTreeJson.isEmpty()) {
|
123
|
try {
|
124
|
JSONParser parser = new JSONParser();
|
125
|
JSONObject topLevelParent = getTopLevelParent(
|
126
|
(JSONObject) parser.parse(currentFundingTreeJson));
|
127
|
if (topLevelParent!=null) {
|
128
|
@SuppressWarnings("unchecked")
|
129
|
Collection<JSONObject> topLevelParentValues = topLevelParent.values();
|
130
|
for (JSONObject currentValue : topLevelParentValues) {
|
131
|
Object currentName = currentValue.get(ELEM_FUNDING_TREE_NAME);
|
132
|
if (currentName!=null) {
|
133
|
return currentName.toString();
|
134
|
}
|
135
|
}
|
136
|
// fallback
|
137
|
return null;
|
138
|
} else {
|
139
|
return null;
|
140
|
}
|
141
|
|
142
|
} catch (ParseException e) {
|
143
|
throw new IOException("unable to parse funding tree: " +
|
144
|
currentFundingTreeJson, e);
|
145
|
}
|
146
|
}
|
147
|
}
|
148
|
// fallback
|
149
|
return null;
|
150
|
} else {
|
151
|
return null;
|
152
|
}
|
153
|
}
|
154
|
|
155
|
private static JSONObject getTopLevelParent(JSONObject parent) {
|
156
|
if (parent!=null) {
|
157
|
@SuppressWarnings("unchecked")
|
158
|
Collection<JSONObject> values = parent.values();
|
159
|
for (JSONObject value : values) {
|
160
|
JSONObject newParent = (JSONObject) value.get(ELEM_FUNDING_TREE_PARENT);
|
161
|
if (newParent!=null && !newParent.isEmpty()) {
|
162
|
return getTopLevelParent(newParent);
|
163
|
}
|
164
|
}
|
165
|
// fallback
|
166
|
return parent;
|
167
|
} else {
|
168
|
return null;
|
169
|
}
|
170
|
|
171
|
}
|
172
|
|
173
|
}
|