1
|
package eu.dnetlib.oai.mongo;
|
2
|
|
3
|
import java.io.ByteArrayInputStream;
|
4
|
import java.io.IOException;
|
5
|
import java.io.StringReader;
|
6
|
import java.util.Date;
|
7
|
import java.util.List;
|
8
|
import java.util.zip.ZipEntry;
|
9
|
import java.util.zip.ZipInputStream;
|
10
|
|
11
|
import javax.annotation.Resource;
|
12
|
|
13
|
import org.apache.commons.io.IOUtils;
|
14
|
import org.apache.commons.lang3.StringEscapeUtils;
|
15
|
import org.dom4j.Document;
|
16
|
import org.dom4j.DocumentException;
|
17
|
import org.dom4j.io.SAXReader;
|
18
|
|
19
|
import com.google.common.collect.Sets;
|
20
|
import com.mongodb.DBObject;
|
21
|
|
22
|
import eu.dnetlib.oai.conf.OAIConfigurationReader;
|
23
|
import eu.dnetlib.oai.info.RecordInfo;
|
24
|
import eu.dnetlib.rmi.provision.OaiPublisherRuntimeException;
|
25
|
|
26
|
/**
|
27
|
* Helper class to generate a RecordInfo object from a Mongo DBObject.
|
28
|
*
|
29
|
* @author alessia
|
30
|
*/
|
31
|
public class RecordInfoGenerator {
|
32
|
|
33
|
@Resource
|
34
|
private MetadataExtractor metadataExtractor;
|
35
|
@Resource
|
36
|
private ProvenanceExtractor provenanceExtractor;
|
37
|
|
38
|
@SuppressWarnings("unchecked")
|
39
|
public RecordInfo transformDBObject(final DBObject object, final boolean includeBody) {
|
40
|
if ((object == null) || object.keySet().isEmpty()) { return null; }
|
41
|
String id = (String) object.get(OAIConfigurationReader.ID_FIELD);
|
42
|
// need to escape the identifier, otherwise the XML breaks
|
43
|
id = StringEscapeUtils.escapeXml11(id);
|
44
|
final boolean deleted = (Boolean) object.get("deleted");
|
45
|
final RecordInfo record = new RecordInfo();
|
46
|
record.setIdentifier(id);
|
47
|
record.setInternalId(object.get("_id").toString());
|
48
|
record.setDatestamp((Date) object.get(OAIConfigurationReader.DATESTAMP_FIELD));
|
49
|
record.setDeleted(deleted);
|
50
|
final List<String> sets = (List<String>) object.get(OAIConfigurationReader.SET_FIELD);
|
51
|
if (sets != null) {
|
52
|
record.setSetspecs(Sets.newHashSet(sets));
|
53
|
}
|
54
|
if (includeBody && !deleted) {
|
55
|
final String body = decompressRecord((byte[]) object.get(OAIConfigurationReader.BODY_FIELD));
|
56
|
final SAXReader reader = new SAXReader();
|
57
|
Document doc;
|
58
|
try {
|
59
|
doc = reader.read(new StringReader(body));
|
60
|
record.setMetadata(this.metadataExtractor.apply(doc));
|
61
|
record.setProvenance(this.provenanceExtractor.apply(doc));
|
62
|
} catch (final DocumentException e) {
|
63
|
throw new OaiPublisherRuntimeException(e);
|
64
|
}
|
65
|
}
|
66
|
return record;
|
67
|
|
68
|
}
|
69
|
|
70
|
public String decompressRecord(final byte[] input) {
|
71
|
|
72
|
try {
|
73
|
final ByteArrayInputStream bis = new ByteArrayInputStream(input);
|
74
|
final ZipInputStream zis = new ZipInputStream(bis);
|
75
|
ZipEntry ze;
|
76
|
ze = zis.getNextEntry();
|
77
|
if (ze == null) { throw new OaiPublisherRuntimeException("cannot decompress null zip entry "); }
|
78
|
if (!ze.getName()
|
79
|
.equals(OAIConfigurationReader.BODY_FIELD)) { throw new OaiPublisherRuntimeException("cannot decompress zip entry name :" + ze.getName()); }
|
80
|
return IOUtils.toString(zis);
|
81
|
} catch (final IOException e) {
|
82
|
throw new OaiPublisherRuntimeException(e);
|
83
|
}
|
84
|
|
85
|
}
|
86
|
|
87
|
public MetadataExtractor getMetadataExtractor() {
|
88
|
return this.metadataExtractor;
|
89
|
}
|
90
|
|
91
|
public void setMetadataExtractor(final MetadataExtractor metadataExtractor) {
|
92
|
this.metadataExtractor = metadataExtractor;
|
93
|
}
|
94
|
|
95
|
public ProvenanceExtractor getProvenanceExtractor() {
|
96
|
return this.provenanceExtractor;
|
97
|
}
|
98
|
|
99
|
public void setProvenanceExtractor(final ProvenanceExtractor provenanceExtractor) {
|
100
|
this.provenanceExtractor = provenanceExtractor;
|
101
|
}
|
102
|
}
|