Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.propagation.country.institutionalrepositories;
2

    
3
import com.google.common.base.Splitter;
4
import com.google.common.collect.Lists;
5
import com.google.common.collect.Sets;
6
import com.google.protobuf.InvalidProtocolBufferException;
7
import eu.dnetlib.data.mapreduce.hbase.propagation.Value;
8
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
9
import eu.dnetlib.data.proto.*;
10
import org.apache.commons.collections.MapUtils;
11
import org.apache.hadoop.hbase.client.Result;
12
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
13
import org.apache.hadoop.hbase.mapreduce.TableMapper;
14
import org.apache.hadoop.hbase.util.Bytes;
15
import org.apache.hadoop.io.Text;
16

    
17
import java.io.IOException;
18
import java.util.ArrayList;
19
import java.util.Map;
20
import java.util.Set;
21
import java.util.stream.Collectors;
22

    
23
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.*;
24
import static eu.dnetlib.data.mapreduce.hbase.propagation.Utils.*;
25
/**
26
 * Created by miriam on 17/08/2018.
27
 */
28
public class PropagationCountryFromDsOrgResultMapper extends TableMapper<InstOrgKey, Text> {
29

    
30
    private Text valueOut;
31

    
32
    private Set<String> datasourceTypes = Sets.newHashSet("pubsrepository::institutional");
33
    private Set<String> whiteList = Sets.newHashSet("10|opendoar____::300891a62162b960cf02ce3827bb363c");
34
    private Set<String> blackList = Sets.newHashSet("");
35

    
36
    @Override
37
    protected void setup(final Context context) throws IOException, InterruptedException {
38
        super.setup(context);
39

    
40
        valueOut = new Text();
41

    
42
        datasourceTypes.addAll(getParam(context, "datasource.types"));
43
        whiteList.addAll(getParam(context, "datasource.whitelist"));
44
    }
45

    
46
    @Override
47
    protected void map(final ImmutableBytesWritable keyIn, final Result value, final Context context) throws IOException, InterruptedException {
48

    
49
        final TypeProtos.Type type = OafRowKeyDecoder.decode(keyIn.copyBytes()).getType();
50
        final OafProtos.OafEntity entity = getEntity(value,type);
51
        if (entity != null) {
52
            switch (type) {
53
                case datasource:
54
                    final DatasourceProtos.Datasource datasource = entity.getDatasource();
55
                    final String id = entity.getId();
56
                    if (datasource == null) {
57
                        throw new RuntimeException("oaf type is datasource, but datasource proto is not found in oafproto");
58
                    }
59

    
60
                    String dsType = datasource.getMetadata().getDatasourcetype().getClassid();
61
                    if (datasourceTypes.contains(dsType)) {
62
                        // verify datasource is in blacklist
63
                        if (blackList.contains(id)){
64
                            context.getCounter(COUNTER_PROPAGATION,"blacklisted").increment(1);
65
                            emitNotAllowedDatasource(context,entity.getId());
66

    
67
                        } else {
68
                            emitAllowedDatasource(value, context, entity.getId(), dsType);
69
                        }
70
                    } else {
71
                        // verify datasource is in whiteList
72

    
73
                        if (whiteList.contains(id)) {
74
                            context.getCounter(COUNTER_PROPAGATION,"whitelisted " + id).increment(1);
75
                            emitAllowedDatasource(value,context,entity.getId(),dsType);
76
                        } else {
77
                            emitNotAllowedDatasource(context, entity.getId());
78
                        }
79
                    }
80

    
81
                    break;
82
                case organization:
83
                    OrganizationProtos.Organization organization = entity.getOrganization();
84
                    if (organization == null) {
85
                        throw new RuntimeException("oaf type is organization, but organization proto is not found in oafproto");
86
                    }
87

    
88
                    FieldTypeProtos.Qualifier country = organization.getMetadata().getCountry();
89
                    if (country == null) {
90
                        context.getCounter(COUNTER_PROPAGATION, "country elem does not exists").increment(1);
91
                    } else {
92
                        final Map<byte[], byte[]> ds_org = value.getFamilyMap("datasourceOrganization_provision_isProvidedBy".getBytes());
93
                        if (MapUtils.isNotEmpty(ds_org)) {
94

    
95
                            for (String dsId : ds_org.keySet().stream().map(String::new).collect(Collectors.toList())) {
96

    
97
                                valueOut.set(Value.newInstance(country.getClassid()).toJson());
98
                                context.write(InstOrgKey.organization(dsId), valueOut);
99
                                context.getCounter(COUNTER_PROPAGATION, "country ").increment(1);
100
                            }
101
                        }
102
                    }
103

    
104
                    break;
105
                case result:
106
                    ResultProtos.Result result = entity.getResult();
107

    
108
                    for (ResultProtos.Result.Instance instance : result.getInstanceList()) {
109
                        //todo add check if key is not empty and field is not null
110

    
111
                        String hostedBy = instance.getHostedby().getKey();
112
                        valueOut.set(Value.newInstance(entity.getId()).toJson());
113
                        context.write(InstOrgKey.publication(hostedBy),valueOut);
114
                        context.getCounter(COUNTER_PROPAGATION, "emit hostedby | collectedfrom for publication ").increment(1);
115
                        String collectedFrom = instance.getCollectedfrom().getKey();
116
                        if (!hostedBy.equals(collectedFrom)) {
117
                            context.write(InstOrgKey.publication(collectedFrom), valueOut);
118
                            context.getCounter(COUNTER_PROPAGATION, "emit hostedby | collectedfrom for publication ").increment(1);
119
                        }
120
                    }
121
                    break;
122
            }
123
        }
124
    }
125

    
126
    private void emitNotAllowedDatasource(Context context, String id) throws IOException, InterruptedException {
127
        valueOut.set(Value.newInstance(ZERO).toJson());
128
        context.write(InstOrgKey.datasource(id), valueOut);
129
        context.getCounter(COUNTER_PROPAGATION, "ds Type not in propagation allowed list").increment(1);
130
    }
131

    
132
    private void emitAllowedDatasource(Result value, Context context, String id, String dsType) throws IOException, InterruptedException {
133
        valueOut.set(Value.newInstance(ONE, getTrust(value)).toJson());
134
        context.write(InstOrgKey.datasource(id), valueOut);
135
        context.getCounter(COUNTER_PROPAGATION, String.format("%s in propagation allowed list", dsType)).increment(1);
136
    }
137

    
138

    
139

    
140
    private String getTrust(Result value) throws InvalidProtocolBufferException {
141
        final Map<byte[],byte[]> map = value.getFamilyMap(Bytes.toBytes("datasource"));
142
        final byte[] body = map.get(Bytes.toBytes("body"));
143
        if (body != null){
144
            OafProtos.Oaf oaf = OafProtos.Oaf.parseFrom(body);
145
            return oaf.getDataInfo().getTrust();
146
        }
147
        return null;
148
    }
149

    
150
    private ArrayList<String> getParam(Context context, String s) {
151
        return Lists.newArrayList(Splitter.on(",").omitEmptyStrings().split(context.getConfiguration().get(s, "")));
152
    }
153

    
154
}
(5-5/7)