1
|
package eu.dnetlib.data.oai.store.mongo;
|
2
|
|
3
|
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.annotation.Resource;

import com.google.common.collect.Sets;
import com.mongodb.DBObject;
import eu.dnetlib.data.information.oai.publisher.OaiPublisherRuntimeException;
import eu.dnetlib.data.information.oai.publisher.conf.OAIConfigurationReader;
import eu.dnetlib.data.information.oai.publisher.info.RecordInfo;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
|
27
|
|
28
|
/**
|
29
|
* Helper class to generate a RecordInfo object from a Mongo DBObject.
|
30
|
*
|
31
|
* @author alessia
|
32
|
*
|
33
|
*/
|
34
|
public class RecordInfoGenerator {
|
35
|
|
36
|
private static final Log log = LogFactory.getLog(RecordInfoGenerator.class);
|
37
|
@Resource
|
38
|
private MetadataExtractor metadataExtractor;
|
39
|
@Resource
|
40
|
private ProvenanceExtractor provenanceExtractor;
|
41
|
|
42
|
private DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSX");
|
43
|
|
44
|
@SuppressWarnings("unchecked")
|
45
|
public RecordInfo transformDBObject(final DBObject object, final boolean includeBody) {
|
46
|
if ((object == null) || object.keySet().isEmpty()) return null;
|
47
|
String id = (String) object.get(OAIConfigurationReader.ID_FIELD);
|
48
|
// need to escape the identifier, otherwise the XML breaks
|
49
|
id = StringEscapeUtils.escapeXml11(id);
|
50
|
boolean deleted = (Boolean) object.get("deleted");
|
51
|
RecordInfo record = new RecordInfo();
|
52
|
record.setIdentifier(id);
|
53
|
record.setInternalId(object.get("_id").toString());
|
54
|
Object datestampObject = object.get(OAIConfigurationReader.DATESTAMP_FIELD);
|
55
|
if(datestampObject instanceof Date){
|
56
|
record.setDatestamp((Date) datestampObject);
|
57
|
}
|
58
|
else {
|
59
|
//assuming we have a string in the form 2017-12-05T14:24:48.61Z to convert to Date
|
60
|
log.debug("Why the hell there are string instead of Dates in datestamp???");
|
61
|
try {
|
62
|
LocalDateTime d = LocalDateTime.parse(datestampObject.toString(), dtf);
|
63
|
Date utilDate = Date.from(d.atZone(ZoneId.systemDefault()).toInstant());
|
64
|
record.setDatestamp(utilDate);
|
65
|
}catch(Exception dateException){
|
66
|
log.warn("Error setting date from a string datestamp, using current date instead: "+dateException.getMessage());
|
67
|
record.setDatestamp(new Date());
|
68
|
}
|
69
|
}
|
70
|
record.setDeleted(deleted);
|
71
|
List<String> sets = (List<String>) object.get(OAIConfigurationReader.SET_FIELD);
|
72
|
if (sets != null) {
|
73
|
record.setSetspecs(Sets.newHashSet(sets));
|
74
|
}
|
75
|
if (includeBody && !deleted) {
|
76
|
String body = decompressRecord((byte[]) object.get(OAIConfigurationReader.BODY_FIELD));
|
77
|
final SAXReader reader = new SAXReader();
|
78
|
Document doc;
|
79
|
try {
|
80
|
doc = reader.read(new StringReader(body));
|
81
|
record.setMetadata(this.metadataExtractor.evaluate(doc));
|
82
|
record.setProvenance(this.provenanceExtractor.evaluate(doc));
|
83
|
} catch (DocumentException e) {
|
84
|
throw new OaiPublisherRuntimeException(e);
|
85
|
}
|
86
|
}
|
87
|
return record;
|
88
|
|
89
|
}
|
90
|
|
91
|
public String decompressRecord(final byte[] input) {
|
92
|
|
93
|
try {
|
94
|
ByteArrayInputStream bis = new ByteArrayInputStream(input);
|
95
|
ZipInputStream zis = new ZipInputStream(bis);
|
96
|
ZipEntry ze;
|
97
|
ze = zis.getNextEntry();
|
98
|
if (ze== null)
|
99
|
throw new OaiPublisherRuntimeException("cannot decompress null zip entry ");
|
100
|
if (!ze.getName().equals(OAIConfigurationReader.BODY_FIELD))
|
101
|
throw new OaiPublisherRuntimeException("cannot decompress zip entry name :"+ze.getName());
|
102
|
return IOUtils.toString(zis);
|
103
|
} catch (IOException e) {
|
104
|
throw new OaiPublisherRuntimeException(e);
|
105
|
}
|
106
|
|
107
|
|
108
|
}
|
109
|
|
110
|
public MetadataExtractor getMetadataExtractor() {
|
111
|
return metadataExtractor;
|
112
|
}
|
113
|
|
114
|
public void setMetadataExtractor(final MetadataExtractor metadataExtractor) {
|
115
|
this.metadataExtractor = metadataExtractor;
|
116
|
}
|
117
|
|
118
|
public ProvenanceExtractor getProvenanceExtractor() {
|
119
|
return provenanceExtractor;
|
120
|
}
|
121
|
|
122
|
public void setProvenanceExtractor(final ProvenanceExtractor provenanceExtractor) {
|
123
|
this.provenanceExtractor = provenanceExtractor;
|
124
|
}
|
125
|
}
|