Project

General

Profile

1
package eu.dnetlib.iis.importer.converter;
2

    
3
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.hbase.client.Result;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.iis.importer.input.approver.ResultApprover;
import eu.dnetlib.iis.importer.schemas.Project;
32

    
33
/**
34
 * HBase {@link Result} to avro {@link Project} converter.
35
 * @author mhorst
36
 *
37
 */
38
public class ProjectConverter extends AbstractAvroConverter<Project> {
39

    
40
	protected static final Logger log = Logger.getLogger(ProjectConverter.class);
41
	
42
	private static final String ELEM_FUNDING_TREE_PARENT = "parent";
43
	private static final String ELEM_FUNDING_TREE_NAME = "name";
44
	
45
	private static final String FUNDER_FUNDING_SEPARATOR = "::";
46

    
47
	private static final Set<String> ACRONYM_SKIP_LOWERCASED_VALUES = new HashSet<String>(
48
			Arrays.asList("undefined", "unknown"));
49
	
50
	/**
51
	 * Default constructor.
52
	 * @param encoding
53
	 * @param resultApprover
54
	 */
55
	public ProjectConverter(String encoding,
56
			ResultApprover resultApprover) {
57
		super(encoding, resultApprover);
58
	}
59

    
60
	@Override
61
	public Project buildObject(Result source, Oaf resolvedOafObject) throws IOException {
62
		eu.dnetlib.data.proto.ProjectProtos.Project sourceProject = resolvedOafObject.getEntity()!=null?
63
				resolvedOafObject.getEntity().getProject():null;
64
		if (sourceProject==null) {
65
			log.error("skipping: no project object " +
66
					"for a row " + new String(source.getRow(), getEncoding()));
67
			return null;
68
		}
69
		if (resolvedOafObject.getEntity().getId()!=null && 
70
				!resolvedOafObject.getEntity().getId().isEmpty()) {
71
			Project.Builder builder = Project.newBuilder();
72
			builder.setId(resolvedOafObject.getEntity().getId());
73
			if (sourceProject.getMetadata()!=null) {
74
				if (isAcronymValid(sourceProject.getMetadata().getAcronym())) {
75
					builder.setProjectAcronym(sourceProject.getMetadata().getAcronym().getValue());
76
				}
77
				if (sourceProject.getMetadata().getCode()!=null &&
78
						sourceProject.getMetadata().getCode().getValue()!=null &&
79
						!sourceProject.getMetadata().getCode().getValue().isEmpty()) {
80
					builder.setProjectGrantId(sourceProject.getMetadata().getCode().getValue());
81
				}
82
				String extractedFundingClass = extractFundingClass(
83
						extractStringValues(sourceProject.getMetadata().getFundingtreeList()));
84
				if (extractedFundingClass!=null && !extractedFundingClass.isEmpty()) {
85
					builder.setFundingClass(extractedFundingClass);	
86
				}
87
			}
88
			return builder.build();	
89
		} else {
90
			log.warn("unable to extract grant number: " +
91
					"unsupported project id: " + resolvedOafObject.getEntity().getId());
92
			return null;
93
		}
94
	}
95

    
96
	/**
97
	 * Extracts string values from {@link StringField} list.
98
	 * @param source
99
	 * @return string values extracted from {@link StringField} list
100
	 */
101
	protected static List<String> extractStringValues(List<StringField> source) {
102
		if (source!=null) {
103
			List<String> results = new ArrayList<String>(source.size());
104
			for (StringField currentField : source) {
105
				results.add(currentField.getValue());
106
			}
107
			return results;
108
		} else {
109
			return null;
110
		}
111
	}
112
	
113
	/**
114
	 * Verifies whether acronym should be considered as valid.
115
	 * @param acronym
116
	 * @return true if valid, false otherwise
117
	 */
118
	private static boolean isAcronymValid(StringField acronym) {
119
		return acronym!=null && isAcronymValid(acronym.getValue());
120
	}
121
	
122
	/**
123
	 * Verifies whether acronym should be considered as valid.
124
	 * @param acronym
125
	 * @return true if valid, false otherwise
126
	 */
127
	public static boolean isAcronymValid(String acronym) {
128
		return acronym!=null && !acronym.isEmpty() && 
129
				!ACRONYM_SKIP_LOWERCASED_VALUES.contains(acronym.trim().toLowerCase());
130
	}
131
	
132
	/**
133
	 * Extracts funding class from funding tree defined as XML.
134
	 * @param fundingTreeXML
135
	 * @return extracted funding class
136
	 * @throws IOException 
137
	 */
138
	public static String extractFundingClassFromXML(Collection<String> fundingTreeXMLList) throws IOException {
139
		if (fundingTreeXMLList!=null && fundingTreeXMLList.size()>0) {
140
			for (String fundingTreeXML : fundingTreeXMLList) {
141
				if (fundingTreeXML!=null && !fundingTreeXML.isEmpty()) {
142
					DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
143
					try {
144
						DocumentBuilder builder = builderFactory.newDocumentBuilder();
145
						Document xmlDocument = builder.parse(new InputSource(new StringReader(fundingTreeXML)));
146
						XPath xPath =  XPathFactory.newInstance().newXPath();
147
						StringBuilder strBuilder = new StringBuilder();
148
						strBuilder.append(xPath.compile("//funder/shortname").evaluate(xmlDocument));
149
						strBuilder.append(FUNDER_FUNDING_SEPARATOR);
150
						strBuilder.append(xPath.compile("//funding_level_0/name").evaluate(xmlDocument));
151
						return strBuilder.toString();
152
					} catch (ParserConfigurationException e) {
153
					    throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
154
					} catch (SAXException e) {
155
						throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
156
					} catch (XPathExpressionException e) {
157
						throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
158
					}	
159
				}
160
			}
161
		}
162
//		fallback
163
		return null;
164
	}
165
	
166
	/**
167
	 * Extracts funding class from funding tree.
168
	 * @param fundingTreeList
169
	 * @return extracted funding class
170
	 * @throws IOException 
171
	 */
172
	public static String extractFundingClass(List<String> fundingTreeList) throws IOException {
173
		return extractFundingClassFromJSON(fundingTreeList);
174
//		return extractFundingClassFromXML(fundingTreeList);
175
	}
176
	
177
	/**
178
	 * Extracts funding class from funding tree defined as JSON.
179
	 * @param fundingTreeJsonList
180
	 * @return extracted funding class
181
	 * @throws IOException 
182
	 */
183
	public static String extractFundingClassFromJSON(List<String> fundingTreeJsonList) throws IOException {
184
		if (fundingTreeJsonList!=null && !fundingTreeJsonList.isEmpty()) {
185
			for (String currentFundingTreeJson : fundingTreeJsonList) {
186
				if (currentFundingTreeJson!=null && !currentFundingTreeJson.isEmpty()) {
187
					try {
188
						JSONParser parser = new JSONParser();
189
						JSONObject topLevelParent = getTopLevelParent(
190
								(JSONObject) parser.parse(currentFundingTreeJson));
191
						if (topLevelParent!=null) {
192
							@SuppressWarnings("unchecked")
193
							Collection<JSONObject> topLevelParentValues = topLevelParent.values();
194
							for (JSONObject currentValue : topLevelParentValues) {
195
								Object currentName = currentValue.get(ELEM_FUNDING_TREE_NAME);
196
								if (currentName!=null) {
197
									return currentName.toString();
198
								}
199
							}
200
//							fallback
201
							return null;
202
						} else {
203
							return null;
204
						}
205
						
206
					} catch (ParseException e) {
207
						throw new IOException("unable to parse funding tree: " + 
208
					currentFundingTreeJson, e);
209
					}	
210
				}
211
			}		
212
//			fallback
213
			return null;
214
		} else {
215
			return null;
216
		}
217
	}
218
	
219
	private static JSONObject getTopLevelParent(JSONObject parent) {
220
		if (parent!=null) {
221
			@SuppressWarnings("unchecked")
222
			Collection<JSONObject> values = parent.values();
223
			for (JSONObject value : values) {
224
				JSONObject newParent = (JSONObject) value.get(ELEM_FUNDING_TREE_PARENT);
225
				if (newParent!=null && !newParent.isEmpty()) {
226
					return getTopLevelParent(newParent);
227
				}	
228
			}
229
//			fallback
230
			return parent;
231
		} else {
232
			return null;
233
		}
234
		
235
	}
236
	
237
}
(9-9/9)