Project

General

Profile

« Previous | Next » 

Revision 54832

update for zenodo community bulk tagging

View differences:

modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/bulktag/BulkTaggingMapper.java
50 50

  
51 51
		if (body != null) {
52 52
			context.getCounter("Bulk Tagging", "not null body ").increment(1);
53

  
53
			final Oaf tobeenriched = Oaf.parseFrom(body);
54
//			if (!tobeenriched.getEntity().getId().equals("50|od______2659::005d5da2dc2176c5156a5390db89f09e"))
55
//				return;
56
//			System.out.println("tobeenriched.getEntity().getResult().getMetadata().toString() = " + tobeenriched.getEntity().getResult().getMetadata().toString());
54 57
			final Oaf oaf = tagger.enrichContext(Oaf.parseFrom(body), cc, context);
55 58
			if (oaf == null) {
56 59
				//context.getCounter("In mapper", " null oaf ").increment(1);
......
64 67
					.count();
65 68
			context.getCounter("Bulk Tagging", " bulktagged ").increment(tagged);
66 69

  
70
//			System.out.println("oaf.getEntity().getResult().getMetadata().toString() = " + oaf.getEntity().getResult().getMetadata().toString());
71
				final Put put = new Put(key.copyBytes()).add(Bytes.toBytes("result"), Bytes.toBytes("body"), oaf.toByteArray());
67 72

  
68
			final Put put = new Put(key.copyBytes()).add(Bytes.toBytes("result"), Bytes.toBytes("body"), oaf.toByteArray());
69

  
70
			if(tagged > 0){
73
			if(!tobeenriched.getEntity().getResult().getMetadata().toString().equals(oaf.getEntity().getResult().getMetadata().toString())){
71 74
				if (enabled)
72 75
					context.write(key, put);
73 76
				context.getCounter("Bulk Tagging", " write op ").increment(1);
74 77
			}
75

  
78
			//throw new RuntimeException();
76 79
		}
77 80
		else{
78 81
			context.getCounter("Bulk Tagging", " null body ").increment(1);
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/bulktag/ResultTagger.java
8 8
import eu.dnetlib.data.proto.FieldTypeProtos;
9 9
import eu.dnetlib.data.proto.OafProtos;
10 10
import eu.dnetlib.data.proto.ResultProtos;
11
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
11 12
import org.apache.commons.lang3.StringUtils;
12 13
import org.apache.hadoop.mapreduce.Mapper;
13 14

  
......
25 26
    private final static String SCHEMA_ID = "dnet:provenanceActions";
26 27
    private final static String COUNTER_GROUP = "Bulk Tagging";
27 28

  
29
    private final static String ZENODO_COMMUNITY_INDICATOR = "https://zenodo.org/communities/";
30

  
28 31
    private String trust = "0.8";
29 32

  
30 33

  
......
36 39

  
37 40
        if(oaf.getDataInfo().getDeletedbyinference()){
38 41
            context.getCounter(COUNTER_GROUP, "deleted by inference").increment(1);
42
            //TODO remove the context associated to Zenodo Communities
39 43
            return null;
40 44
        }
41 45
        //context.getCounter(COUNTER_GROUP, "not deleted by inference").increment(1);
......
49 53
        }
50 54
        //communities contains all the communities to be added as context for the result
51 55
        final Set<String> communities = new HashSet<>();
52

  
56
        final Set<String> subjects = new HashSet<>();
53 57
        oaf.getEntity().getResult().getMetadata().getSubjectList().stream()
54 58
                .map(subject -> subject.getValue())
55 59
                .filter(StringUtils::isNotBlank)
56 60
                .map(String::toLowerCase)
57 61
                .map(String::trim)
58 62
                .collect(Collectors.toCollection(HashSet::new))
59
                .forEach(s -> communities.addAll(conf.getCommunityForSubjectValue(s)));
63
                .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
60 64

  
65
        //updating counters for subject matching
66
        subjects.forEach(c->context.getCounter(COUNTER_GROUP, "Matching subject for community " + c).increment(1));
67
        communities.addAll(subjects);
68
        context.getCounter(COUNTER_GROUP,"match found for subjects ").increment(subjects.size());
69

  
70
        final Set<String> datasources = new HashSet<>();
61 71
        oaf.getEntity().getResult().getInstanceList()
62 72
                .stream()
63 73
                .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
64 74
                .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
65 75
                .map(s -> StringUtils.substringAfter(s, "|"))
66 76
                .collect(Collectors.toCollection(HashSet::new))
67
                .forEach(dsId -> communities.addAll(conf.getCommunityForDatasourceValue(dsId)));
77
                .forEach(dsId -> datasources.addAll(conf.getCommunityForDatasourceValue(dsId)));
68 78

  
69
        //TODO: add code for Zenodo Communities
79
        datasources.forEach(c->context.getCounter(COUNTER_GROUP,"Matching datasource for community " + c).increment(1));
80
        communities.addAll(datasources);
81
        context.getCounter(COUNTER_GROUP,"Match found for content providers " ).increment(datasources.size());
70 82

  
83
        final Set<String> czenodo = new HashSet<>();
84
        final ResultProtos.Result.Metadata.Builder mBuilder = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
85
        mBuilder.getContextBuilderList().stream().filter(cBuilder -> cBuilder.getId().contains(ZENODO_COMMUNITY_INDICATOR))
86
                .collect(Collectors.toList())
87
                .forEach(c->czenodo.addAll(conf.getCommunityForZenodoCommunityValue(c.getId().substring(c.getId().lastIndexOf("/")+1).trim())));
88
        czenodo.forEach(c->context.getCounter(COUNTER_GROUP,"Matching Zenodo community for community " + c).increment(1));
89
        context.getCounter(COUNTER_GROUP,"Match found for Zenodo communities " ).increment(czenodo.size());
90

  
91
        communities.addAll(czenodo);
92
        List<Context.Builder> clist = mBuilder.getContextBuilderList().stream()
93
                .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))).collect(Collectors.toList());
94

  
95

  
96
        mBuilder.clearContext();
97
        clist.forEach(c->mBuilder.addContext(c));
98

  
99

  
100
        final Map<String, ResultProtos.Result.Context.Builder> cBuilders = Maps.newHashMap();
101

  
102

  
71 103
        if(communities.isEmpty()){
72 104
            context.getCounter(COUNTER_GROUP, "list of communities empty").increment(1);
73 105
        }else{
74 106
            context.getCounter(COUNTER_GROUP, "list of communities has values!").increment(1);
75 107
        }
76 108

  
77
        final ResultProtos.Result.Metadata.Builder mBuilder = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
78

  
79
        final Map<String, ResultProtos.Result.Context.Builder> cBuilders = Maps.newHashMap();
80 109
        mBuilder.getContextBuilderList().forEach(cBuilder -> {
81 110
            cBuilders.put(cBuilder.getId(), cBuilder);
82 111
        });

Also available in: Unified diff