Revision 52112
Added by Alessia Bardi about 6 years ago
modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/utils/RecordFieldsExtractor.java | ||
---|---|---|
63 | 63 |
if (skipDuplicates && isDuplicate(doc)) { |
64 | 64 |
recordProps.put("duplicate", "true"); |
65 | 65 |
} |
66 |
//dates |
|
66 |
//dates: note that this will be used to generate the LAST_COLLECTION_DATE_FIELD and DATESTAMP_FIELD
|
|
67 | 67 |
Node coll = doc.selectSingleNode("//*[local-name()='header']/*[local-name()='dateOfCollection']"); |
68 | 68 |
Node trans = doc.selectSingleNode("//*[local-name()='header']/*[local-name()='dateOfTransformation']"); |
69 | 69 |
if(coll != null && StringUtils.isNotBlank(coll.getText())){ |
modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/OaiFeedMapper.java | ||
---|---|---|
11 | 11 |
import java.util.zip.ZipEntry; |
12 | 12 |
import java.util.zip.ZipOutputStream; |
13 | 13 |
|
14 |
import com.google.common.base.Function; |
|
15 | 14 |
import com.google.common.base.Splitter; |
16 | 15 |
import com.google.common.collect.Iterables; |
17 | 16 |
import com.google.common.collect.Lists; |
... | ... | |
224 | 223 |
Collection<String> values = recordProperties.get(key); |
225 | 224 |
if (key.equals(OAIConfigurationReader.SET_FIELD)) { |
226 | 225 |
|
227 |
Iterable<String> setSpecs = Iterables.transform(values, new Function<String, String>() { |
|
228 |
|
|
229 |
@Override |
|
230 |
public String apply(final String s) { |
|
231 |
return mongoSetCollection.normalizeSetSpec(s); |
|
232 |
} |
|
233 |
|
|
234 |
}); |
|
226 |
Iterable<String> setSpecs = Iterables.transform(values, s -> mongoSetCollection.normalizeSetSpec(s)); |
|
235 | 227 |
obj.put(key, setSpecs); |
236 | 228 |
} else { |
237 | 229 |
PublisherField keyField = fieldsToIndex.get(key); |
238 |
if (keyField == null) { |
|
239 |
context.getCounter("oai", key + " found for record but not in configuration. Assuming it is repeatable.").increment(1); |
|
240 |
} |
|
241 |
// let's check if the key is the name of a repeatable field or not |
|
242 |
if ((keyField != null) && !keyField.isRepeatable()) { |
|
243 |
if ((values != null) && !values.isEmpty()) { |
|
244 |
obj.put(key, values.iterator().next()); |
|
230 |
//Skipping record properties that are not listed as fields to index (e.g. date of transformation and collection) |
|
231 |
if (keyField != null) { |
|
232 |
// let's check if the key is the name of a repeatable field or not |
|
233 |
if ((keyField != null) && !keyField.isRepeatable()) { |
|
234 |
if ((values != null) && !values.isEmpty()) { |
|
235 |
obj.put(key, values.iterator().next()); |
|
236 |
} |
|
237 |
} else { |
|
238 |
obj.put(key, values); |
|
245 | 239 |
} |
246 |
} else { |
|
247 |
obj.put(key, values); |
|
248 | 240 |
} |
249 | 241 |
} |
250 | 242 |
} |
Also available in: Unified diff
Do not add to the BasicDBObject any properties that are not listed as fields to index