1 |
26600
|
sandro.lab
|
package eu.dnetlib.data.transform;
|
2 |
|
|
|
3 |
|
|
import static org.junit.Assert.assertFalse;
|
4 |
|
|
import static org.junit.Assert.assertNotNull;
|
5 |
|
|
|
6 |
|
|
import java.io.InputStream;
|
7 |
30967
|
claudio.at
|
import java.util.HashMap;
|
8 |
26600
|
sandro.lab
|
import java.util.List;
|
9 |
30967
|
claudio.at
|
import java.util.Map;
|
10 |
|
|
import java.util.Map.Entry;
|
11 |
26600
|
sandro.lab
|
|
12 |
30967
|
claudio.at
|
import javax.xml.transform.TransformerConfigurationException;
|
13 |
|
|
import javax.xml.transform.TransformerFactoryConfigurationError;
|
14 |
|
|
|
15 |
26600
|
sandro.lab
|
import org.apache.commons.io.IOUtils;
|
16 |
30967
|
claudio.at
|
import org.dom4j.DocumentException;
|
17 |
26600
|
sandro.lab
|
import org.junit.Before;
|
18 |
|
|
import org.junit.Test;
|
19 |
|
|
|
20 |
31997
|
claudio.at
|
import com.google.common.collect.Lists;
|
21 |
30967
|
claudio.at
|
import com.google.common.collect.Maps;
|
22 |
26600
|
sandro.lab
|
import com.google.protobuf.InvalidProtocolBufferException;
|
23 |
|
|
|
24 |
|
|
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
|
25 |
30967
|
claudio.at
|
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
|
26 |
26600
|
sandro.lab
|
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
|
27 |
|
|
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
|
28 |
30967
|
claudio.at
|
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
|
29 |
28094
|
claudio.at
|
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
|
30 |
|
|
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
|
31 |
26600
|
sandro.lab
|
import eu.dnetlib.data.mapreduce.util.OafDecoder;
|
32 |
30967
|
claudio.at
|
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
|
33 |
|
|
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
|
34 |
26600
|
sandro.lab
|
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
|
35 |
|
|
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
|
36 |
|
|
import eu.dnetlib.data.proto.KindProtos.Kind;
|
37 |
|
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
38 |
30967
|
claudio.at
|
import eu.dnetlib.data.proto.OafProtos.OafRel;
|
39 |
|
|
import eu.dnetlib.data.proto.TypeProtos.Type;
|
40 |
26600
|
sandro.lab
|
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
|
41 |
|
|
|
42 |
|
|
public class XsltRowTransformerFactoryTest {
|
43 |
|
|
|
44 |
|
|
private XsltRowTransformerFactory factory;
|
45 |
|
|
|
46 |
30967
|
claudio.at
|
private EntityConfigTable entityConfigTable;
|
47 |
26600
|
sandro.lab
|
|
48 |
31997
|
claudio.at
|
// private final InputStream datacite2insertActionsInputStream =
|
49 |
|
|
// getClass().getResourceAsStream("/eu/dnetlib/actionmanager/xslt/datacite2insertActions.xslt");
|
50 |
30967
|
claudio.at
|
|
51 |
26600
|
sandro.lab
|
private final InputStream dmf2hbaseInputStream = getClass().getResourceAsStream("dmf_2_hbase.xsl");
|
52 |
|
|
|
53 |
|
|
private final InputStream datacite2hbaseInputStream = getClass().getResourceAsStream("datacite_2_hbase.xsl");
|
54 |
|
|
|
55 |
|
|
private final InputStream project2hbaseInputStream = getClass().getResourceAsStream("projects_2_hbase.xsl");
|
56 |
|
|
|
57 |
28308
|
claudio.at
|
private final InputStream recordInputStream = getClass().getResourceAsStream("record.xml");
|
58 |
26600
|
sandro.lab
|
|
59 |
|
|
private final InputStream recordDataciteInputStream = getClass().getResourceAsStream("recordDatacite.xml");
|
60 |
|
|
|
61 |
|
|
private final InputStream projectRecordInputStream = getClass().getResourceAsStream("projectRecord.xml");
|
62 |
|
|
|
63 |
30967
|
claudio.at
|
private final InputStream recordClaimInputStream = getClass().getResourceAsStream("recordClaim.xml");
|
64 |
|
|
|
65 |
31997
|
claudio.at
|
private final InputStream recordPangaeOafInputStream = getClass().getResourceAsStream("pangaeOAF.xml");
|
66 |
|
|
|
67 |
|
|
private final InputStream recordPangaeOdfInputStream = getClass().getResourceAsStream("pangaeODF.xml");
|
68 |
|
|
|
69 |
26600
|
sandro.lab
|
@Before
|
70 |
|
|
public void setUp() throws Exception {
|
71 |
|
|
factory = new XsltRowTransformerFactory();
|
72 |
30967
|
claudio.at
|
entityConfigTable = IndexConfig.load(IndexConfigTest.config).getConfigMap();
|
73 |
26600
|
sandro.lab
|
}
|
74 |
|
|
|
75 |
|
|
@Test
|
76 |
31997
|
claudio.at
|
public void testParseClaim() throws Exception {
|
77 |
26600
|
sandro.lab
|
|
78 |
31997
|
claudio.at
|
doTest(dmf2hbaseInputStream, recordClaimInputStream);
|
79 |
30967
|
claudio.at
|
}
|
80 |
26600
|
sandro.lab
|
|
81 |
30967
|
claudio.at
|
@Test
|
82 |
31997
|
claudio.at
|
public void testParseProjectWithStats() throws Exception {
|
83 |
26600
|
sandro.lab
|
|
84 |
31997
|
claudio.at
|
doTest(project2hbaseInputStream, projectRecordInputStream);
|
85 |
26600
|
sandro.lab
|
}
|
86 |
|
|
|
87 |
|
|
@Test
|
88 |
31997
|
claudio.at
|
public void testParseDmf() throws Exception {
|
89 |
26600
|
sandro.lab
|
|
90 |
31997
|
claudio.at
|
doTest(dmf2hbaseInputStream, recordInputStream);
|
91 |
30967
|
claudio.at
|
}
|
92 |
26600
|
sandro.lab
|
|
93 |
30967
|
claudio.at
|
@Test
|
94 |
31997
|
claudio.at
|
public void testParseDatacite() throws Exception {
|
95 |
26600
|
sandro.lab
|
|
96 |
31997
|
claudio.at
|
doTest(datacite2hbaseInputStream, recordDataciteInputStream);
|
97 |
30967
|
claudio.at
|
}
|
98 |
26600
|
sandro.lab
|
|
99 |
31997
|
claudio.at
|
@Test
|
100 |
|
|
public void testParsePangaeOAF() throws Exception {
|
101 |
26600
|
sandro.lab
|
|
102 |
31997
|
claudio.at
|
doTest(dmf2hbaseInputStream, recordPangaeOafInputStream);
|
103 |
26600
|
sandro.lab
|
}
|
104 |
|
|
|
105 |
31997
|
claudio.at
|
@Test
|
106 |
|
|
public void testParsePangaeODF() throws Exception {
|
107 |
26600
|
sandro.lab
|
|
108 |
31997
|
claudio.at
|
doTest(datacite2hbaseInputStream, recordPangaeOdfInputStream);
|
109 |
|
|
}
|
110 |
26600
|
sandro.lab
|
|
111 |
31997
|
claudio.at
|
@Test
|
112 |
|
|
public void testLinkPangae() throws Exception {
|
113 |
26600
|
sandro.lab
|
|
114 |
31997
|
claudio.at
|
List<Row> rows = Lists.newArrayList();
|
115 |
|
|
rows.addAll(asRows(datacite2hbaseInputStream, recordPangaeOdfInputStream));
|
116 |
|
|
rows.addAll(asRows(dmf2hbaseInputStream, recordPangaeOafInputStream));
|
117 |
|
|
|
118 |
|
|
print(mapAll(buildTable(rows)));
|
119 |
|
|
}
|
120 |
|
|
|
121 |
|
|
private void doTest(final InputStream xsltStream, final InputStream recordStream) throws Exception {
|
122 |
|
|
try {
|
123 |
|
|
List<Row> rows = asRows(xsltStream, recordStream);
|
124 |
|
|
|
125 |
30967
|
claudio.at
|
// System.out.println(rows);
|
126 |
26600
|
sandro.lab
|
|
127 |
31997
|
claudio.at
|
Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
|
128 |
26600
|
sandro.lab
|
|
129 |
30967
|
claudio.at
|
// System.out.println("\n" + table.toString());
|
130 |
26600
|
sandro.lab
|
|
131 |
31997
|
claudio.at
|
Map<String, XmlRecordFactory> builders = mapAll(table);
|
132 |
28094
|
claudio.at
|
|
133 |
31997
|
claudio.at
|
print(builders);
|
134 |
30967
|
claudio.at
|
} catch (InvalidProtocolBufferException e) {
|
135 |
31997
|
claudio.at
|
throw new Exception(e);
|
136 |
30967
|
claudio.at
|
} catch (TransformerConfigurationException e) {
|
137 |
31997
|
claudio.at
|
throw new Exception(e);
|
138 |
30967
|
claudio.at
|
} catch (TransformerFactoryConfigurationError e) {
|
139 |
31997
|
claudio.at
|
throw new Exception(e);
|
140 |
30967
|
claudio.at
|
} catch (DocumentException e) {
|
141 |
31997
|
claudio.at
|
throw new Exception(e);
|
142 |
26600
|
sandro.lab
|
}
|
143 |
|
|
}
|
144 |
|
|
|
145 |
31997
|
claudio.at
|
private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream) throws Exception {
|
146 |
|
|
String xslt = IOUtils.toString(xsltStream);
|
147 |
|
|
XsltRowTransformer transformer = factory.getTransformer(xslt);
|
148 |
|
|
assertNotNull(transformer);
|
149 |
26600
|
sandro.lab
|
|
150 |
31997
|
claudio.at
|
String record = IOUtils.toString(recordStream);
|
151 |
|
|
List<Row> rows = transformer.apply(record);
|
152 |
|
|
|
153 |
|
|
assertNotNull(rows);
|
154 |
|
|
assertFalse(rows.isEmpty());
|
155 |
|
|
return rows;
|
156 |
|
|
}
|
157 |
|
|
|
158 |
|
|
private Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) {
|
159 |
|
|
Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
|
160 |
|
|
|
161 |
|
|
for (Row row : rows) {
|
162 |
|
|
String rowKey = row.getKey();
|
163 |
|
|
String cf = row.getColumnFamily();
|
164 |
|
|
if (!table.containsKey(rowKey)) {
|
165 |
|
|
table.put(rowKey, new HashMap<String, Map<String, byte[]>>());
|
166 |
|
|
}
|
167 |
|
|
if (!table.get(rowKey).containsKey(cf)) {
|
168 |
|
|
table.get(rowKey).put(row.getColumnFamily(), new HashMap<String, byte[]>());
|
169 |
|
|
}
|
170 |
|
|
for (Column<String, byte[]> c : row.getColumns()) {
|
171 |
|
|
System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
|
172 |
|
|
table.get(rowKey).get(cf).put(c.getName(), c.getValue());
|
173 |
|
|
}
|
174 |
|
|
}
|
175 |
|
|
return table;
|
176 |
|
|
}
|
177 |
|
|
|
178 |
|
|
private Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
|
179 |
|
|
|
180 |
|
|
Map<String, XmlRecordFactory> builders = Maps.newHashMap();
|
181 |
|
|
for (Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
|
182 |
|
|
map(builders, e.getKey(), e.getValue());
|
183 |
|
|
}
|
184 |
|
|
return builders;
|
185 |
|
|
}
|
186 |
|
|
|
187 |
|
|
private void map(final Map<String, XmlRecordFactory> builders, final String rowKey, final Map<String, Map<String, byte[]>> row) throws Exception {
|
188 |
|
|
|
189 |
30967
|
claudio.at
|
final Type type = OafRowKeyDecoder.decode(rowKey).getType();
|
190 |
26600
|
sandro.lab
|
|
191 |
30967
|
claudio.at
|
Map<String, byte[]> colEntity = row.get(type.toString());
|
192 |
26600
|
sandro.lab
|
|
193 |
30967
|
claudio.at
|
if (colEntity == null) return;
|
194 |
26600
|
sandro.lab
|
|
195 |
30967
|
claudio.at
|
byte[] bodyB = colEntity.get("body");
|
196 |
26600
|
sandro.lab
|
|
197 |
30967
|
claudio.at
|
if (bodyB != null) {
|
198 |
|
|
ensureBuilder(builders, rowKey);
|
199 |
|
|
OafDecoder mainEntity = OafDecoder.decode(Oaf.parseFrom(bodyB));
|
200 |
|
|
builders.get(rowKey).setMainEntity(mainEntity);
|
201 |
28094
|
claudio.at
|
|
202 |
30967
|
claudio.at
|
for (LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
|
203 |
26600
|
sandro.lab
|
|
204 |
30967
|
claudio.at
|
String it = ld.getRelDescriptor().getIt();
|
205 |
|
|
Map<String, byte[]> cols = row.get(it);
|
206 |
26600
|
sandro.lab
|
|
207 |
30967
|
claudio.at
|
if ((cols != null) && (!cols.isEmpty())) {
|
208 |
26600
|
sandro.lab
|
|
209 |
30967
|
claudio.at
|
for (byte[] oafB : cols.values()) {
|
210 |
|
|
Oaf oaf = Oaf.parseFrom(oafB);
|
211 |
26600
|
sandro.lab
|
|
212 |
30967
|
claudio.at
|
OafRel.Builder relBuilder = OafRel.newBuilder(oaf.getRel());
|
213 |
26600
|
sandro.lab
|
|
214 |
30967
|
claudio.at
|
if (ld.isSymmetric()) {
|
215 |
|
|
RelDescriptor rd = ld.getRelDescriptor();
|
216 |
|
|
relBuilder.setCachedTarget(mainEntity.getEntity()).setRelType(rd.getRelType()).setSubRelType(rd.getSubRelType());
|
217 |
|
|
}
|
218 |
26600
|
sandro.lab
|
|
219 |
30967
|
claudio.at
|
OafRel oafRel = relBuilder.setChild(ld.isChild()).build();
|
220 |
26600
|
sandro.lab
|
|
221 |
30967
|
claudio.at
|
final Oaf.Builder oafBuilder = Oaf.newBuilder().setKind(Kind.relation).setTimestamp(System.currentTimeMillis());
|
222 |
|
|
oafBuilder.getRelBuilder().mergeFrom(oafRel);
|
223 |
26600
|
sandro.lab
|
|
224 |
30967
|
claudio.at
|
String targetId = ld.isSymmetric() ? oafRel.getTarget() : oafRel.getSource();
|
225 |
|
|
ensureBuilder(builders, targetId);
|
226 |
|
|
OafDecoder decoder = OafDecoder.decode(oafBuilder.build());
|
227 |
|
|
if (ld.isChild()) {
|
228 |
|
|
builders.get(targetId).addChild(decoder);
|
229 |
|
|
} else {
|
230 |
|
|
builders.get(targetId).addRelation(decoder);
|
231 |
|
|
}
|
232 |
26600
|
sandro.lab
|
|
233 |
30967
|
claudio.at
|
}
|
234 |
|
|
}
|
235 |
|
|
}
|
236 |
26600
|
sandro.lab
|
}
|
237 |
|
|
|
238 |
|
|
}
|
239 |
|
|
|
240 |
31997
|
claudio.at
|
private void print(final Map<String, XmlRecordFactory> builders) {
|
241 |
|
|
for (Entry<String, XmlRecordFactory> e : builders.entrySet()) {
|
242 |
|
|
if (e.getValue().isValid()) {
|
243 |
|
|
System.out.println(IndentXmlString.apply(e.getValue().build()));
|
244 |
|
|
} else {
|
245 |
|
|
System.out.println("invalid builder: " + e.getKey());
|
246 |
|
|
}
|
247 |
|
|
}
|
248 |
|
|
}
|
249 |
|
|
|
250 |
|
|
private void ensureBuilder(final Map<String, XmlRecordFactory> builders, final String rowKey) throws Exception {
|
251 |
30967
|
claudio.at
|
if (!builders.containsKey(rowKey)) {
|
252 |
|
|
builders.put(rowKey, newBuilder());
|
253 |
26600
|
sandro.lab
|
}
|
254 |
|
|
}
|
255 |
|
|
|
256 |
30967
|
claudio.at
|
private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
|
257 |
|
|
return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(eu.dnetlib.data.mapreduce.hbase.index.config.Context.xml),
|
258 |
|
|
RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false);
|
259 |
|
|
}
|
260 |
|
|
|
261 |
|
|
@Test
|
262 |
31997
|
claudio.at
|
public void test_template() throws Exception {
|
263 |
26600
|
sandro.lab
|
String xslt = IOUtils.toString(dmf2hbaseInputStream);
|
264 |
|
|
XsltRowTransformer transformer = factory.getTransformer(xslt);
|
265 |
|
|
assertNotNull(transformer);
|
266 |
|
|
|
267 |
|
|
String record = IOUtils.toString(recordInputStream);
|
268 |
|
|
List<Row> rows = transformer.apply(record);
|
269 |
|
|
|
270 |
|
|
System.out.println(rows);
|
271 |
|
|
}
|
272 |
|
|
|
273 |
|
|
}
|