package eu.dnetlib.data.transform;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;

import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactoryConfigurationError;

import org.apache.commons.io.IOUtils;
import org.dom4j.DocumentException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.InvalidProtocolBufferException;

import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;

public class XsltRowTransformerFactoryTest {
43
44
	private XsltRowTransformerFactory factory;
45
46 30967 claudio.at
	private EntityConfigTable entityConfigTable;
47 26600 sandro.lab
48 31997 claudio.at
	// private final InputStream datacite2insertActionsInputStream =
49
	// getClass().getResourceAsStream("/eu/dnetlib/actionmanager/xslt/datacite2insertActions.xslt");
50 30967 claudio.at
51 26600 sandro.lab
	private final InputStream dmf2hbaseInputStream = getClass().getResourceAsStream("dmf_2_hbase.xsl");
52
53
	private final InputStream datacite2hbaseInputStream = getClass().getResourceAsStream("datacite_2_hbase.xsl");
54
55
	private final InputStream project2hbaseInputStream = getClass().getResourceAsStream("projects_2_hbase.xsl");
56
57 28308 claudio.at
	private final InputStream recordInputStream = getClass().getResourceAsStream("record.xml");
58 26600 sandro.lab
59
	private final InputStream recordDataciteInputStream = getClass().getResourceAsStream("recordDatacite.xml");
60
61
	private final InputStream projectRecordInputStream = getClass().getResourceAsStream("projectRecord.xml");
62
63 30967 claudio.at
	private final InputStream recordClaimInputStream = getClass().getResourceAsStream("recordClaim.xml");
64
65 31997 claudio.at
	private final InputStream recordPangaeOafInputStream = getClass().getResourceAsStream("pangaeOAF.xml");
66
67
	private final InputStream recordPangaeOdfInputStream = getClass().getResourceAsStream("pangaeODF.xml");
68
69 26600 sandro.lab
	@Before
70
	public void setUp() throws Exception {
71
		factory = new XsltRowTransformerFactory();
72 30967 claudio.at
		entityConfigTable = IndexConfig.load(IndexConfigTest.config).getConfigMap();
73 26600 sandro.lab
	}
74
75
	@Test
76 31997 claudio.at
	public void testParseClaim() throws Exception {
77 26600 sandro.lab
78 31997 claudio.at
		doTest(dmf2hbaseInputStream, recordClaimInputStream);
79 30967 claudio.at
	}
80 26600 sandro.lab
81 30967 claudio.at
	@Test
82 31997 claudio.at
	public void testParseProjectWithStats() throws Exception {
83 26600 sandro.lab
84 31997 claudio.at
		doTest(project2hbaseInputStream, projectRecordInputStream);
85 26600 sandro.lab
	}
86
87
	@Test
88 31997 claudio.at
	public void testParseDmf() throws Exception {
89 26600 sandro.lab
90 31997 claudio.at
		doTest(dmf2hbaseInputStream, recordInputStream);
91 30967 claudio.at
	}
92 26600 sandro.lab
93 30967 claudio.at
	@Test
94 31997 claudio.at
	public void testParseDatacite() throws Exception {
95 26600 sandro.lab
96 31997 claudio.at
		doTest(datacite2hbaseInputStream, recordDataciteInputStream);
97 30967 claudio.at
	}
98 26600 sandro.lab
99 31997 claudio.at
	@Test
100
	public void testParsePangaeOAF() throws Exception {
101 26600 sandro.lab
102 31997 claudio.at
		doTest(dmf2hbaseInputStream, recordPangaeOafInputStream);
103 26600 sandro.lab
	}
104
105 31997 claudio.at
	@Test
106
	public void testParsePangaeODF() throws Exception {
107 26600 sandro.lab
108 31997 claudio.at
		doTest(datacite2hbaseInputStream, recordPangaeOdfInputStream);
109
	}
110 26600 sandro.lab
111 31997 claudio.at
	@Test
112
	public void testLinkPangae() throws Exception {
113 26600 sandro.lab
114 31997 claudio.at
		List<Row> rows = Lists.newArrayList();
115
		rows.addAll(asRows(datacite2hbaseInputStream, recordPangaeOdfInputStream));
116
		rows.addAll(asRows(dmf2hbaseInputStream, recordPangaeOafInputStream));
117
118
		print(mapAll(buildTable(rows)));
119
	}
120
121
	private void doTest(final InputStream xsltStream, final InputStream recordStream) throws Exception {
122
		try {
123
			List<Row> rows = asRows(xsltStream, recordStream);
124
125 30967 claudio.at
			// System.out.println(rows);
126 26600 sandro.lab
127 31997 claudio.at
			Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
128 26600 sandro.lab
129 30967 claudio.at
			// System.out.println("\n" + table.toString());
130 26600 sandro.lab
131 31997 claudio.at
			Map<String, XmlRecordFactory> builders = mapAll(table);
132 28094 claudio.at
133 31997 claudio.at
			print(builders);
134 30967 claudio.at
		} catch (InvalidProtocolBufferException e) {
135 31997 claudio.at
			throw new Exception(e);
136 30967 claudio.at
		} catch (TransformerConfigurationException e) {
137 31997 claudio.at
			throw new Exception(e);
138 30967 claudio.at
		} catch (TransformerFactoryConfigurationError e) {
139 31997 claudio.at
			throw new Exception(e);
140 30967 claudio.at
		} catch (DocumentException e) {
141 31997 claudio.at
			throw new Exception(e);
142 26600 sandro.lab
		}
143
	}
144
145 31997 claudio.at
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream) throws Exception {
146
		String xslt = IOUtils.toString(xsltStream);
147
		XsltRowTransformer transformer = factory.getTransformer(xslt);
148
		assertNotNull(transformer);
149 26600 sandro.lab
150 31997 claudio.at
		String record = IOUtils.toString(recordStream);
151
		List<Row> rows = transformer.apply(record);
152
153
		assertNotNull(rows);
154
		assertFalse(rows.isEmpty());
155
		return rows;
156
	}
157
158
	private Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) {
159
		Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
160
161
		for (Row row : rows) {
162
			String rowKey = row.getKey();
163
			String cf = row.getColumnFamily();
164
			if (!table.containsKey(rowKey)) {
165
				table.put(rowKey, new HashMap<String, Map<String, byte[]>>());
166
			}
167
			if (!table.get(rowKey).containsKey(cf)) {
168
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<String, byte[]>());
169
			}
170
			for (Column<String, byte[]> c : row.getColumns()) {
171
				System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
172
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
173
			}
174
		}
175
		return table;
176
	}
177
178
	private Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
179
180
		Map<String, XmlRecordFactory> builders = Maps.newHashMap();
181
		for (Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
182
			map(builders, e.getKey(), e.getValue());
183
		}
184
		return builders;
185
	}
186
187
	private void map(final Map<String, XmlRecordFactory> builders, final String rowKey, final Map<String, Map<String, byte[]>> row) throws Exception {
188
189 30967 claudio.at
		final Type type = OafRowKeyDecoder.decode(rowKey).getType();
190 26600 sandro.lab
191 30967 claudio.at
		Map<String, byte[]> colEntity = row.get(type.toString());
192 26600 sandro.lab
193 30967 claudio.at
		if (colEntity == null) return;
194 26600 sandro.lab
195 30967 claudio.at
		byte[] bodyB = colEntity.get("body");
196 26600 sandro.lab
197 30967 claudio.at
		if (bodyB != null) {
198
			ensureBuilder(builders, rowKey);
199
			OafDecoder mainEntity = OafDecoder.decode(Oaf.parseFrom(bodyB));
200
			builders.get(rowKey).setMainEntity(mainEntity);
201 28094 claudio.at
202 30967 claudio.at
			for (LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
203 26600 sandro.lab
204 30967 claudio.at
				String it = ld.getRelDescriptor().getIt();
205
				Map<String, byte[]> cols = row.get(it);
206 26600 sandro.lab
207 30967 claudio.at
				if ((cols != null) && (!cols.isEmpty())) {
208 26600 sandro.lab
209 30967 claudio.at
					for (byte[] oafB : cols.values()) {
210
						Oaf oaf = Oaf.parseFrom(oafB);
211 26600 sandro.lab
212 30967 claudio.at
						OafRel.Builder relBuilder = OafRel.newBuilder(oaf.getRel());
213 26600 sandro.lab
214 30967 claudio.at
						if (ld.isSymmetric()) {
215
							RelDescriptor rd = ld.getRelDescriptor();
216
							relBuilder.setCachedTarget(mainEntity.getEntity()).setRelType(rd.getRelType()).setSubRelType(rd.getSubRelType());
217
						}
218 26600 sandro.lab
219 30967 claudio.at
						OafRel oafRel = relBuilder.setChild(ld.isChild()).build();
220 26600 sandro.lab
221 30967 claudio.at
						final Oaf.Builder oafBuilder = Oaf.newBuilder().setKind(Kind.relation).setTimestamp(System.currentTimeMillis());
222
						oafBuilder.getRelBuilder().mergeFrom(oafRel);
223 26600 sandro.lab
224 30967 claudio.at
						String targetId = ld.isSymmetric() ? oafRel.getTarget() : oafRel.getSource();
225
						ensureBuilder(builders, targetId);
226
						OafDecoder decoder = OafDecoder.decode(oafBuilder.build());
227
						if (ld.isChild()) {
228
							builders.get(targetId).addChild(decoder);
229
						} else {
230
							builders.get(targetId).addRelation(decoder);
231
						}
232 26600 sandro.lab
233 30967 claudio.at
					}
234
				}
235
			}
236 26600 sandro.lab
		}
237
238
	}
239
240 31997 claudio.at
	private void print(final Map<String, XmlRecordFactory> builders) {
241
		for (Entry<String, XmlRecordFactory> e : builders.entrySet()) {
242
			if (e.getValue().isValid()) {
243
				System.out.println(IndentXmlString.apply(e.getValue().build()));
244
			} else {
245
				System.out.println("invalid builder: " + e.getKey());
246
			}
247
		}
248
	}
249
250
	private void ensureBuilder(final Map<String, XmlRecordFactory> builders, final String rowKey) throws Exception {
251 30967 claudio.at
		if (!builders.containsKey(rowKey)) {
252
			builders.put(rowKey, newBuilder());
253 26600 sandro.lab
		}
254
	}
255
256 30967 claudio.at
	private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
257
		return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(eu.dnetlib.data.mapreduce.hbase.index.config.Context.xml),
258
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false);
259
	}
260
261
	@Test
262 31997 claudio.at
	public void test_template() throws Exception {
263 26600 sandro.lab
		String xslt = IOUtils.toString(dmf2hbaseInputStream);
264
		XsltRowTransformer transformer = factory.getTransformer(xslt);
265
		assertNotNull(transformer);
266
267
		String record = IOUtils.toString(recordInputStream);
268
		List<Row> rows = transformer.apply(record);
269
270
		System.out.println(rows);
271
	}
272
273
}