Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.oai;
2

    
3
import java.io.IOException;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.WriteConcern;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfiguration;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationReader;
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationStringReader;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.MongoSetCollection;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.PublisherField;
import eu.dnetlib.data.mapreduce.hbase.oai.utils.RecordFieldsExtractor;
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.types.Binary;

    
40
public class OaiFeedMapper extends Mapper<Text, Text, NullWritable, NullWritable> {
41

    
42
	/**
	 * Status labels for a record: {@code NEW}, {@code UPDATED} or {@code UNCHANGED}.
	 * NOTE(review): no code visible in this class references the enum; presumably it is used
	 * by other classes of the package - to be confirmed before removing it.
	 */
	enum RecordStatus {
		NEW,
		UPDATED,
		UNCHANGED
	}
45

    
46
	private MongoCollection<DBObject> collection;
47
	private MongoCollection<DBObject> discardedCollection;
48
	private OAIConfigurationStringReader oaiConfigurationReader;
49
	private OAIConfiguration oaiConfiguration;
50

    
51
	private Date feedDate;
52

    
53
	private MongoSetCollection mongoSetCollection;
54

    
55
	private RecordFieldsExtractor extractor;
56

    
57
	// these are set in the setup
58
	private String format;
59
	private String interpretation;
60
	private String layout;
61
	private Map<String, PublisherField> fieldsToIndex = Maps.newHashMap();
62

    
63
	private String duplicateXPath;
64
	private boolean skipDuplicates;
65

    
66
	private MongoClient mongo;
67

    
68
	private Collection<String> enrichmentXPaths;
69

    
70
	private String[] parseDatePatterns;
71

    
72
	@Override
73
	protected void setup(final Context context) throws UnknownHostException {
74

    
75
		String host = context.getConfiguration().get("services.publisher.oai.host");
76
		String port = context.getConfiguration().get("services.publisher.oai.port");
77
		String db = context.getConfiguration().get("services.publisher.oai.db");
78
		String collectionName = context.getConfiguration().get("services.publisher.oai.collection");
79
		String patterns = context.getConfiguration().get("services.publisher.oai.datepatterns");
80
		this.parseDatePatterns = Lists.newArrayList(Splitter.on(',')
81
				.trimResults()
82
				.omitEmptyStrings().split(patterns)).toArray(new String[0]);
83

    
84
		System.out.println("Mongodb client params");
85
		System.out.println("host: " + host);
86
		System.out.println("port: " + port);
87
		System.out.println("db: " + db);
88
		System.out.println("collection: " + collectionName);
89
		System.out.println("split date patterns: " + patterns);
90

    
91
		String[] formatLayoutInterp = collectionName.split("-");
92
		format = formatLayoutInterp[0];
93
		layout = formatLayoutInterp[1];
94
		interpretation = formatLayoutInterp[2];
95

    
96
		String oaiConfigurationProfile = context.getConfiguration().get("oaiConfiguration");
97
		System.out.println("oaiConfiguration:\n" + IndentXmlString.apply(oaiConfigurationProfile));
98
		oaiConfigurationReader = new OAIConfigurationStringReader(oaiConfigurationProfile);
99
		oaiConfiguration = oaiConfigurationReader.getOaiConfiguration();
100

    
101
		System.out.println("parsed configuration:" + oaiConfiguration.toString());
102

    
103
		mongo = new MongoClient(host, Integer.parseInt(port));
104
		MongoDatabase mongoDB = mongo.getDatabase(db);
105

    
106
		collection = mongoDB.getCollection(collectionName, DBObject.class).withWriteConcern(WriteConcern.UNACKNOWLEDGED);
107
		discardedCollection = mongoDB.getCollection("discarded-" + collectionName, DBObject.class).withWriteConcern(WriteConcern.UNACKNOWLEDGED);
108
		mongoSetCollection = new MongoSetCollection(mongo);
109

    
110
		duplicateXPath = context.getConfiguration().get("services.publisher.oai.duplicateXPath");
111
		skipDuplicates = Boolean.parseBoolean(context.getConfiguration().get("services.publisher.oai.skipDuplicates"));
112

    
113
		enrichmentXPaths = oaiConfiguration.getEnrichmentXPathsFor(format, layout, interpretation);
114
		Collection<PublisherField> indexFields = oaiConfiguration.getFieldsFor(format, layout, interpretation);
115
		extractor = new RecordFieldsExtractor(Lists.newArrayList(indexFields));
116
		extractor.setDuplicateXPath(duplicateXPath);
117
		extractor.setSkipDuplicates(skipDuplicates);
118

    
119
		for (PublisherField field : indexFields) {
120
			fieldsToIndex.put(field.getFieldName(), field);
121
		}
122

    
123
		String feedDateString = context.getConfiguration().get(JobParams.OAI_FEED_DATE);
124
		feedDate = parseDate(feedDateString);
125
	}
126

    
127
	@Override
128
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
129
		String recordKey = key.toString();
130
		String recordBody = value.toString();
131
		if (StringUtils.isBlank(recordBody)) {
132
			discard(context, recordKey, recordBody, "blank body");
133
		} else {
134
			Multimap<String, String> recordFields = extractor.extractFields(recordBody, enrichmentXPaths);
135
			String id;
136
			String oaiID;
137
			if (checkRecordFields(recordFields, context, recordKey, recordBody)) {
138
				id = recordFields.get(OAIConfigurationReader.ID_FIELD).iterator().next();
139
				oaiID = getOAIIdentifier(id);
140
				handleRecord(context, oaiID, recordBody, recordFields);
141
			}
142
		}
143
	}
144

    
145
	public boolean checkRecordFields(final Multimap<String, String> recordFields, final Context context, final String recordKey, final String recordBody) {
146
		if ((recordFields == null)) {
147
			context.getCounter("oai", "invalid").increment(1);
148
			return false;
149
		}
150
		if (recordFields.containsEntry("duplicate", "true")) {
151
			if (skipDuplicates) {
152
				context.getCounter("oai", "discardedDuplicate").increment(1);
153
				return false;
154
			} else return true;
155
		}
156
		if (!recordFields.containsKey(OAIConfigurationReader.ID_FIELD)) {
157
			discard(context, recordKey, recordBody, "missing " + OAIConfigurationReader.ID_FIELD);
158
			return false;
159
		}
160
		return true;
161
	}
162

    
163
	private void handleRecord(final Context context, final String oaiID, final String record, final Multimap<String, String> recordProperties) {
164
		DBObject obj = this.createBasicObject(oaiID, record, recordProperties,context);
165
		if (obj != null) { // it can be null if the compression did not succeeded: counter is updated in the compress method in that case
166
			//let's use the date of collection and transformation for those records that have them.
167
			//for the rest, we'll keep the feedDate...
168
			Collection<String> collectionDates = recordProperties.get("dateOfCollection");
169
			Collection<String> transDates = recordProperties.get("dateOfTransformation");
170
			Date collDate = feedDate;
171
			if(collectionDates != null && !collectionDates.isEmpty()){
172
				String collDateString = collectionDates.iterator().next();
173
				if(StringUtils.isNotBlank(collDateString))
174
					collDate = parseDate(collDateString);
175
			}
176
			else{
177
				context.getCounter("oai", "missing dateOfCollection").increment(1);
178
			}
179
			obj.put(OAIConfigurationReader.LAST_COLLECTION_DATE_FIELD, collDate);
180
			Date timestamp = feedDate;
181
			if(transDates != null && !transDates.isEmpty()){
182
				String transDateString = transDates.iterator().next();
183
				if(StringUtils.isNotBlank(transDateString))
184
					timestamp = parseDate(transDateString);
185
			}
186
			else{
187
				context.getCounter("oai", "missing dateOfTransformation").increment(1);
188
			}
189
			obj.put(OAIConfigurationReader.DATESTAMP_FIELD, timestamp);
190
			//the updated_field must stay false, as the oai store is empty.
191
			obj.put(OAIConfigurationReader.UPDATED_FIELD, false);
192
			collection.insertOne(obj);
193
			context.getCounter("oai", "total").increment(1);
194
		}
195
	}
196

    
197
	protected Date parseDate(final String date) {
198
		// date should be in the form: 2017-12-18 or 2017-12-18T12:00:04+00:00 or 2014-10-12T15:54:02.43Z or 2014-10-12T15:54:02.432Z or 2014-10-12T15:54:02.4Z
199
		try {
200
			LocalDateTime d = LocalDateTime.parse(date, DateTimeFormatter.ISO_ZONED_DATE_TIME);
201
			return Date.from(d.atZone(ZoneId.systemDefault()).toInstant());
202
		}catch(Exception dateException){
203
			try {
204
				return org.apache.commons.lang.time.DateUtils.parseDate(
205
						date,
206
						parseDatePatterns);
207
			}catch(Exception dateException2){
208
				dateException2.printStackTrace(System.err);
209
				throw new RuntimeException(dateException2);
210
			}
211
		}
212
	}
213

    
214
	private void discard(final Context context, final String recordKey, final String recordBody, final String reason) {
215
		context.getCounter("oai", reason).increment(1);
216
		discardedCollection.insertOne(new BasicDBObject("id", recordKey).append(OAIConfigurationReader.BODY_FIELD, recordBody));
217
	}
218

    
219
	private String getOAIIdentifier(final String id) {
220
		return oaiConfiguration.getIdScheme() + ":" + oaiConfiguration.getIdNamespace() + ":" + id;
221
	}
222

    
223
	protected DBObject createBasicObject(final String oaiID, final String record, final Multimap<String, String> recordProperties, final Context context) {
224
		DBObject obj = new BasicDBObject();
225
		for (final String key : recordProperties.keySet()) {
226
			if (key.equals(OAIConfigurationReader.ID_FIELD)) {
227
				obj.put(key, oaiID);
228
			} else {
229
				Collection<String> values = recordProperties.get(key);
230
				if (key.equals(OAIConfigurationReader.SET_FIELD)) {
231

    
232
					Iterable<String> setSpecs = Iterables.transform(values, s -> mongoSetCollection.normalizeSetSpec(s));
233
					obj.put(key, setSpecs);
234
				} else {
235
					PublisherField keyField = fieldsToIndex.get(key);
236
					//Skipping record properties that are not listed as fields to index (e.g. date of transformation and collection)
237
					if (keyField != null) {
238
						// let's check if the key is the name of a repeatable field or not
239
						if ((keyField != null) && !keyField.isRepeatable()) {
240
							if ((values != null) && !values.isEmpty()) {
241
								obj.put(key, values.iterator().next());
242
							}
243
						} else {
244
							obj.put(key, values);
245
						}
246
					}
247
				}
248
			}
249
		}
250

    
251
		Binary compressedRecordBody = createCompressRecord(context, oaiID, record);
252
		if (compressedRecordBody != null) {
253
			obj.put(OAIConfigurationReader.BODY_FIELD, compressedRecordBody);
254
			obj.put(OAIConfigurationReader.DELETED_FIELD, false);
255
			return obj;
256
		} else return null;
257
	}
258

    
259
	public Binary createCompressRecord(final Context context, final String recordKey, final String recordBody) {
260
		try {
261
			ByteArrayOutputStream os = new ByteArrayOutputStream();
262
			ZipOutputStream zos = new ZipOutputStream(os);
263
			ZipEntry entry = new ZipEntry(OAIConfigurationReader.BODY_FIELD);
264
			zos.putNextEntry(entry);
265
			zos.write(recordBody.getBytes());
266
			zos.closeEntry();
267
			zos.flush();
268
			zos.close();
269
			return new Binary(os.toByteArray());
270
		} catch (IOException e) {
271
			discard(context, recordKey, recordBody, "cannot compress");
272
			return null;
273
		}
274
	}
275

    
276
	@Override
277
	protected void cleanup(final Context context) throws IOException, InterruptedException {
278

    
279
		super.cleanup(context);
280
	}
281

    
282
	public MongoCollection<DBObject> getCollection() {
283
		return collection;
284
	}
285

    
286
	public void setCollection(final MongoCollection<DBObject> collection) {
287
		this.collection = collection;
288
	}
289

    
290
	public MongoCollection<DBObject> getDiscardedCollection() {
291
		return discardedCollection;
292
	}
293

    
294
	public void setDiscardedCollection(final MongoCollection<DBObject> discardedCollection) {
295
		this.discardedCollection = discardedCollection;
296
	}
297

    
298
	public OAIConfigurationStringReader getOaiConfigurationReader() {
299
		return oaiConfigurationReader;
300
	}
301

    
302
	public void setOaiConfigurationReader(final OAIConfigurationStringReader oaiConfigurationReader) {
303
		this.oaiConfigurationReader = oaiConfigurationReader;
304
	}
305

    
306
	public OAIConfiguration getOaiConfiguration() {
307
		return oaiConfiguration;
308
	}
309

    
310
	public void setOaiConfiguration(final OAIConfiguration oaiConfiguration) {
311
		this.oaiConfiguration = oaiConfiguration;
312
	}
313

    
314
	public Date getFeedDate() {
315
		return feedDate;
316
	}
317

    
318
	public void setFeedDate(final Date feedDate) {
319
		this.feedDate = feedDate;
320
	}
321

    
322
	public MongoSetCollection getMongoSetCollection() {
323
		return mongoSetCollection;
324
	}
325

    
326
	public void setMongoSetCollection(final MongoSetCollection mongoSetCollection) {
327
		this.mongoSetCollection = mongoSetCollection;
328
	}
329

    
330
	public String getDuplicateXPath() {
331
		return duplicateXPath;
332
	}
333

    
334
	public void setDuplicateXPath(final String duplicateXPath) {
335
		this.duplicateXPath = duplicateXPath;
336
	}
337

    
338
	public boolean isSkipDuplicates() {
339
		return skipDuplicates;
340
	}
341

    
342
	public void setSkipDuplicates(final boolean skipDuplicates) {
343
		this.skipDuplicates = skipDuplicates;
344
	}
345

    
346
	public String[] getParseDatePatterns() {
347
		return parseDatePatterns;
348
	}
349

    
350
	public void setParseDatePatterns(final String[] parseDatePatterns) {
351
		this.parseDatePatterns = parseDatePatterns;
352
	}
353
}
    (1-1/1)