package eu.dnetlib.data.mapreduce.hbase.oai;

import java.io.IOException;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.WriteConcern;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfiguration;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationReader;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationStringReader;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.MongoSetCollection;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.PublisherField;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.RecordFieldsExtractor;
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.types.Binary;

public class OaiFeedMapper extends Mapper<Text, Text, NullWritable, NullWritable> {
|
41
|
|
42
|
enum RecordStatus {
|
43
|
NEW, UPDATED, UNCHANGED;
|
44
|
}
|
45
|
|
46
|
private MongoCollection<DBObject> collection;
|
47
|
private MongoCollection<DBObject> discardedCollection;
|
48
|
private OAIConfigurationStringReader oaiConfigurationReader;
|
49
|
private OAIConfiguration oaiConfiguration;
|
50
|
|
51
|
private Date feedDate;
|
52
|
|
53
|
private MongoSetCollection mongoSetCollection;
|
54
|
|
55
|
private RecordFieldsExtractor extractor;
|
56
|
|
57
|
// these are set in the setup
|
58
|
private String format;
|
59
|
private String interpretation;
|
60
|
private String layout;
|
61
|
private Map<String, PublisherField> fieldsToIndex = Maps.newHashMap();
|
62
|
|
63
|
private String duplicateXPath;
|
64
|
private boolean skipDuplicates;
|
65
|
|
66
|
private MongoClient mongo;
|
67
|
|
68
|
private Collection<String> enrichmentXPaths;
|
69
|
|
70
|
@Override
|
71
|
protected void setup(final Context context) throws UnknownHostException {
|
72
|
|
73
|
String host = context.getConfiguration().get("services.publisher.oai.host");
|
74
|
String port = context.getConfiguration().get("services.publisher.oai.port");
|
75
|
String db = context.getConfiguration().get("services.publisher.oai.db");
|
76
|
String collectionName = context.getConfiguration().get("services.publisher.oai.collection");
|
77
|
|
78
|
System.out.println("Mongodb client params");
|
79
|
System.out.println("host: " + host);
|
80
|
System.out.println("port: " + port);
|
81
|
System.out.println("db: " + db);
|
82
|
System.out.println("collection: " + collectionName);
|
83
|
|
84
|
String[] formatLayoutInterp = collectionName.split("-");
|
85
|
format = formatLayoutInterp[0];
|
86
|
layout = formatLayoutInterp[1];
|
87
|
interpretation = formatLayoutInterp[2];
|
88
|
|
89
|
String oaiConfigurationProfile = context.getConfiguration().get("oaiConfiguration");
|
90
|
System.out.println("oaiConfiguration:\n" + IndentXmlString.apply(oaiConfigurationProfile));
|
91
|
oaiConfigurationReader = new OAIConfigurationStringReader(oaiConfigurationProfile);
|
92
|
oaiConfiguration = oaiConfigurationReader.getOaiConfiguration();
|
93
|
|
94
|
System.out.println("parsed configuration:" + oaiConfiguration.toString());
|
95
|
|
96
|
mongo = new MongoClient(host, Integer.parseInt(port));
|
97
|
MongoDatabase mongoDB = mongo.getDatabase(db);
|
98
|
//DB mongoDB = mongo.getDB(db);
|
99
|
collection = mongoDB.getCollection(collectionName, DBObject.class).withWriteConcern(WriteConcern.UNACKNOWLEDGED);
|
100
|
discardedCollection = mongoDB.getCollection("discarded-" + collectionName, DBObject.class).withWriteConcern(WriteConcern.UNACKNOWLEDGED);
|
101
|
mongoSetCollection = new MongoSetCollection(mongo);
|
102
|
|
103
|
duplicateXPath = context.getConfiguration().get("services.publisher.oai.duplicateXPath");
|
104
|
skipDuplicates = Boolean.parseBoolean(context.getConfiguration().get("services.publisher.oai.skipDuplicates"));
|
105
|
|
106
|
enrichmentXPaths = oaiConfiguration.getEnrichmentXPathsFor(format, layout, interpretation);
|
107
|
Collection<PublisherField> indexFields = oaiConfiguration.getFieldsFor(format, layout, interpretation);
|
108
|
extractor = new RecordFieldsExtractor(Lists.newArrayList(indexFields));
|
109
|
extractor.setDuplicateXPath(duplicateXPath);
|
110
|
extractor.setSkipDuplicates(skipDuplicates);
|
111
|
|
112
|
for (PublisherField field : indexFields) {
|
113
|
fieldsToIndex.put(field.getFieldName(), field);
|
114
|
}
|
115
|
|
116
|
String feedDateString = context.getConfiguration().get(JobParams.OAI_FEED_DATE);
|
117
|
feedDate = parseDate(feedDateString);
|
118
|
}
|
119
|
|
120
|
@Override
|
121
|
protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
|
122
|
String recordKey = key.toString();
|
123
|
String recordBody = value.toString();
|
124
|
if (StringUtils.isBlank(recordBody)) {
|
125
|
discard(context, recordKey, recordBody, "blank body");
|
126
|
} else {
|
127
|
Multimap<String, String> recordFields = extractor.extractFields(recordBody, enrichmentXPaths);
|
128
|
String id;
|
129
|
String oaiID;
|
130
|
if (checkRecordFields(recordFields, context, recordKey, recordBody)) {
|
131
|
id = recordFields.get(OAIConfigurationReader.ID_FIELD).iterator().next();
|
132
|
oaiID = getOAIIdentifier(id);
|
133
|
handleRecord(context, oaiID, recordBody, recordFields);
|
134
|
}
|
135
|
}
|
136
|
}
|
137
|
|
138
|
public boolean checkRecordFields(final Multimap<String, String> recordFields, final Context context, final String recordKey, final String recordBody) {
|
139
|
if ((recordFields == null)) {
|
140
|
context.getCounter("oai", "invalid").increment(1);
|
141
|
return false;
|
142
|
}
|
143
|
if (recordFields.containsEntry("duplicate", "true")) {
|
144
|
if (skipDuplicates) {
|
145
|
context.getCounter("oai", "discardedDuplicate").increment(1);
|
146
|
return false;
|
147
|
} else return true;
|
148
|
}
|
149
|
if (!recordFields.containsKey(OAIConfigurationReader.ID_FIELD)) {
|
150
|
discard(context, recordKey, recordBody, "missing " + OAIConfigurationReader.ID_FIELD);
|
151
|
return false;
|
152
|
}
|
153
|
return true;
|
154
|
}
|
155
|
|
156
|
private void handleRecord(final Context context, final String oaiID, final String record, final Multimap<String, String> recordProperties) {
|
157
|
DBObject obj = this.createBasicObject(oaiID, record, recordProperties,context);
|
158
|
if (obj != null) { // it can be null if the compression did not succeeded: counter is updated in the compress method in that case
|
159
|
//let's use the date of collection and transformation for those records that have them.
|
160
|
//for the rest, we'll keep the feedDate...
|
161
|
Collection<String> collectionDates = recordProperties.get("dateOfCollection");
|
162
|
Collection<String> transDates = recordProperties.get("dateOfTransformation");
|
163
|
Date collDate = feedDate;
|
164
|
if(collectionDates != null && !collectionDates.isEmpty()){
|
165
|
String collDateString = collectionDates.iterator().next();
|
166
|
if(StringUtils.isNotBlank(collDateString))
|
167
|
collDate = parseDate(collDateString);
|
168
|
}
|
169
|
obj.put(OAIConfigurationReader.LAST_COLLECTION_DATE_FIELD, collDate);
|
170
|
Date timestamp = feedDate;
|
171
|
if(transDates != null && !transDates.isEmpty()){
|
172
|
String transDateString = transDates.iterator().next();
|
173
|
if(StringUtils.isNotBlank(transDateString))
|
174
|
timestamp = parseDate(transDateString);
|
175
|
}
|
176
|
obj.put(OAIConfigurationReader.DATESTAMP_FIELD, timestamp);
|
177
|
//the updated_field must stay false, as the oai store is empty.
|
178
|
obj.put(OAIConfigurationReader.UPDATED_FIELD, false);
|
179
|
collection.insertOne(obj);
|
180
|
context.getCounter("oai", "total").increment(1);
|
181
|
}
|
182
|
}
|
183
|
|
184
|
protected Date parseDate(final String date) {
|
185
|
// date should be in the form: 2017-12-18 or 2017-12-18T12:00:04+00:00 or 2014-10-12T15:54:02.43Z or 2014-10-12T15:54:02.432Z or 2014-10-12T15:54:02.4Z
|
186
|
try {
|
187
|
LocalDateTime d = LocalDateTime.parse(date, DateTimeFormatter.ISO_ZONED_DATE_TIME);
|
188
|
return Date.from(d.atZone(ZoneId.systemDefault()).toInstant());
|
189
|
}catch(Exception dateException){
|
190
|
try {
|
191
|
return org.apache.commons.lang.time.DateUtils.parseDate(
|
192
|
date,
|
193
|
new String[] { "yyyy-MM-dd", "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssZ",
|
194
|
"yyyy-MM-dd'T'HH:mm:ss.SX" });
|
195
|
}catch(Exception dateException2){
|
196
|
dateException2.printStackTrace(System.err);
|
197
|
throw new RuntimeException(dateException2);
|
198
|
}
|
199
|
}
|
200
|
}
|
201
|
|
202
|
private void discard(final Context context, final String recordKey, final String recordBody, final String reason) {
|
203
|
context.getCounter("oai", reason).increment(1);
|
204
|
discardedCollection.insertOne(new BasicDBObject("id", recordKey).append(OAIConfigurationReader.BODY_FIELD, recordBody));
|
205
|
}
|
206
|
|
207
|
private String getOAIIdentifier(final String id) {
|
208
|
return oaiConfiguration.getIdScheme() + ":" + oaiConfiguration.getIdNamespace() + ":" + id;
|
209
|
}
|
210
|
|
211
|
protected DBObject createBasicObject(final String oaiID, final String record, final Multimap<String, String> recordProperties, final Context context) {
|
212
|
DBObject obj = new BasicDBObject();
|
213
|
for (final String key : recordProperties.keySet()) {
|
214
|
if (key.equals(OAIConfigurationReader.ID_FIELD)) {
|
215
|
obj.put(key, oaiID);
|
216
|
} else {
|
217
|
Collection<String> values = recordProperties.get(key);
|
218
|
if (key.equals(OAIConfigurationReader.SET_FIELD)) {
|
219
|
|
220
|
Iterable<String> setSpecs = Iterables.transform(values, new Function<String, String>() {
|
221
|
|
222
|
@Override
|
223
|
public String apply(final String s) {
|
224
|
return mongoSetCollection.normalizeSetSpec(s);
|
225
|
}
|
226
|
|
227
|
});
|
228
|
obj.put(key, setSpecs);
|
229
|
} else {
|
230
|
PublisherField keyField = fieldsToIndex.get(key);
|
231
|
if (keyField == null) {
|
232
|
context.getCounter("oai", key + " found for record but not in configuration. Assuming it is repeatable.").increment(1);
|
233
|
}
|
234
|
// let's check if the key is the name of a repeatable field or not
|
235
|
if ((keyField != null) && !keyField.isRepeatable()) {
|
236
|
if ((values != null) && !values.isEmpty()) {
|
237
|
obj.put(key, values.iterator().next());
|
238
|
}
|
239
|
} else {
|
240
|
obj.put(key, values);
|
241
|
}
|
242
|
}
|
243
|
}
|
244
|
}
|
245
|
|
246
|
Binary compressedRecordBody = createCompressRecord(context, oaiID, record);
|
247
|
if (compressedRecordBody != null) {
|
248
|
obj.put(OAIConfigurationReader.BODY_FIELD, compressedRecordBody);
|
249
|
obj.put(OAIConfigurationReader.DELETED_FIELD, false);
|
250
|
return obj;
|
251
|
} else return null;
|
252
|
}
|
253
|
|
254
|
public Binary createCompressRecord(final Context context, final String recordKey, final String recordBody) {
|
255
|
try {
|
256
|
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
257
|
ZipOutputStream zos = new ZipOutputStream(os);
|
258
|
ZipEntry entry = new ZipEntry(OAIConfigurationReader.BODY_FIELD);
|
259
|
zos.putNextEntry(entry);
|
260
|
zos.write(recordBody.getBytes());
|
261
|
zos.closeEntry();
|
262
|
zos.flush();
|
263
|
zos.close();
|
264
|
return new Binary(os.toByteArray());
|
265
|
} catch (IOException e) {
|
266
|
discard(context, recordKey, recordBody, "cannot compress");
|
267
|
return null;
|
268
|
}
|
269
|
}
|
270
|
|
271
|
@Override
|
272
|
protected void cleanup(final Context context) throws IOException, InterruptedException {
|
273
|
|
274
|
super.cleanup(context);
|
275
|
}
|
276
|
|
277
|
public MongoCollection<DBObject> getCollection() {
|
278
|
return collection;
|
279
|
}
|
280
|
|
281
|
public void setCollection(final MongoCollection<DBObject> collection) {
|
282
|
this.collection = collection;
|
283
|
}
|
284
|
|
285
|
public MongoCollection<DBObject> getDiscardedCollection() {
|
286
|
return discardedCollection;
|
287
|
}
|
288
|
|
289
|
public void setDiscardedCollection(final MongoCollection<DBObject> discardedCollection) {
|
290
|
this.discardedCollection = discardedCollection;
|
291
|
}
|
292
|
|
293
|
public OAIConfigurationStringReader getOaiConfigurationReader() {
|
294
|
return oaiConfigurationReader;
|
295
|
}
|
296
|
|
297
|
public void setOaiConfigurationReader(final OAIConfigurationStringReader oaiConfigurationReader) {
|
298
|
this.oaiConfigurationReader = oaiConfigurationReader;
|
299
|
}
|
300
|
|
301
|
public OAIConfiguration getOaiConfiguration() {
|
302
|
return oaiConfiguration;
|
303
|
}
|
304
|
|
305
|
public void setOaiConfiguration(final OAIConfiguration oaiConfiguration) {
|
306
|
this.oaiConfiguration = oaiConfiguration;
|
307
|
}
|
308
|
|
309
|
public Date getFeedDate() {
|
310
|
return feedDate;
|
311
|
}
|
312
|
|
313
|
public void setFeedDate(final Date feedDate) {
|
314
|
this.feedDate = feedDate;
|
315
|
}
|
316
|
|
317
|
public MongoSetCollection getMongoSetCollection() {
|
318
|
return mongoSetCollection;
|
319
|
}
|
320
|
|
321
|
public void setMongoSetCollection(final MongoSetCollection mongoSetCollection) {
|
322
|
this.mongoSetCollection = mongoSetCollection;
|
323
|
}
|
324
|
|
325
|
public String getDuplicateXPath() {
|
326
|
return duplicateXPath;
|
327
|
}
|
328
|
|
329
|
public void setDuplicateXPath(final String duplicateXPath) {
|
330
|
this.duplicateXPath = duplicateXPath;
|
331
|
}
|
332
|
|
333
|
public boolean isSkipDuplicates() {
|
334
|
return skipDuplicates;
|
335
|
}
|
336
|
|
337
|
public void setSkipDuplicates(final boolean skipDuplicates) {
|
338
|
this.skipDuplicates = skipDuplicates;
|
339
|
}
|
340
|
|
341
|
}
|