Project

General

Profile

1
from json import JSONEncoder
2
from elasticsearch import Elasticsearch
3
from elasticsearch_dsl import *
4

    
5

    
6
def create_typology_filter(value):
7
    return Q('match', typology=value)
8

    
9

    
10
def create_pid_type_filter(value):
11
    args = {'localIdentifier.type': value}
12
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))
13

    
14

    
15
def create_datasource_filter(value):
16
    args = {'datasources.datasourceName': value}
17
    return Q('nested', path='datasources', query=Q('bool', must=[Q('match', **args)]))
18

    
19

    
20
class DLIESResponseEncoder(JSONEncoder):
21
    def default(self, o):
22
        return o.__dict__
23

    
24

    
25
class DLIESResponse(object):
26
    def __init__(self, facet=None, total=0, hits=[]):
27
        if facet is None:
28
            facet = dict(pid=[], typology=[], datasource=[])
29
        self.facet = facet
30
        self.total = total
31
        self.hits = hits
32

    
33

    
34
class DLIESConnector(object):
35
    def __init__(self, index_host, index_name):
36
        self.index_host = index_host
37
        self.client = Elasticsearch(hosts=[index_host])
38
        self.index_name = index_name
39

    
40
    def simple_query(self, textual_query, start=None, end=None, filter_key=None, filter_value=None):
41
        s = Search(using=self.client, index=self.index_name).doc_type('object').sort('-relatedDatasets',
42
                                                                                     '-relatedPublications',
43
                                                                                     '-relatedUnknown')
44
        q = Q('match', _all=textual_query)
45
        s.aggs.bucket('typologies', 'terms', field='typology')
46
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
47
                                                                              field='datasources.datasourceName')
48

    
49
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
50
                                                                           field='localIdentifier.type')
51

    
52
        if filter_key is not None and len(filter_key) > 0:
53
            if filter_key == 'typology':
54
                s = s.query(q).filter(create_typology_filter('dataset'))
55
            elif filter_key == 'datasource':
56
                s = s.query(q).filter(create_datasource_filter(filter_value))
57
            elif filter_key == 'pidtype':
58
                s = s.query(q).filter(create_pid_type_filter(filter_value))
59
            else:
60
                s = s.query(q)
61
        else:
62
            s = s.query(q)
63

    
64
        if start is not None:
65
            if end is None:
66
                end = start + 10
67
            s = s[start:end]
68
        response = s.execute()
69

    
70
        hits = []
71

    
72
        for index_result in response.hits:
73
            hits.append(index_result.__dict__['_d_'])
74

    
75
        pid_types = []
76
        for tag in response.aggs.all_pids.all_types.buckets:
77
            pid_types.append(dict(key=tag.key, count=tag.doc_count))
78

    
79
        datasources = []
80
        for tag in response.aggs.all_datasources.all_names.buckets:
81
            datasources.append(dict(key=tag.key, count=tag.doc_count))
82

    
83
        typologies = []
84
        for tag in response.aggs.typologies.buckets:
85
            typologies.append(dict(key=tag.key, count=tag.doc_count))
86

    
87
        return DLIESResponse(total=response.hits.total,
88
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources), hits=hits)
89

    
90
    def related_type(self, object_id, object_type, start=None):
91
        args = {'target.objectType': object_type}
92
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
93
        args_id = {'source.dnetIdentifier': object_id}
94
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
95
        s = Search(using=self.client).doc_type('scholix').query(query_for_id & query_type)
96
        if start:
97
            s = s[start:start + 10]
98

    
99
        response = s.execute()
100
        hits = []
101

    
102
        for index_hit in response.hits:
103
            hits.append(index_hit.__dict__['_d_'])
104

    
105
        return hits
106

    
107
    def fix_collectedFrom(self, source, relation):
108
        relSource = relation.get('source')
109
        collectedFrom = relSource['collectedFrom']
110
        for coll in collectedFrom:
111
            for d in source['datasources']:
112
                if d['datasourceName'] == coll['provider']['name']:
113
                    d['provisionMode'] = coll['provisionMode']
114
        return source
115

    
116
    def item_by_id(self, id, type=None, start=None):
117
        try:
118
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
119
            hits = []
120
            input_source = res['_source']
121
            related_publications = []
122
            related_dataset = []
123
            related_unknown = []
124

    
125
            rel_source = None
126
            if input_source.get('relatedPublications') > 0:
127
                if 'publication' == type:
128
                    related_publications = self.related_type(id, 'publication', start)
129
                else:
130
                    related_publications = self.related_type(id, 'publication')
131
                rel_source = related_publications[0]
132
            if input_source.get('relatedDatasets') > 0:
133
                if 'dataset' == type:
134
                    related_dataset = self.related_type(id, 'dataset', start)
135
                else:
136
                    related_dataset = self.related_type(id, 'dataset')
137
                rel_source = related_dataset[0]
138
            if input_source.get('relatedUnknown') > 0:
139
                if 'unknown' == type:
140
                    related_unknown = self.related_type(id, 'unknown', start)
141
                else:
142
                    related_unknown = self.related_type(id, 'unknown')
143
                rel_source = related_unknown[0]
144

    
145
            input_source = self.fix_collectedFrom(input_source, rel_source)
146
            hits.append(input_source)
147

    
148
            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
149
                             related_unknown=related_unknown))
150

    
151
            return DLIESResponse(total=1, hits=hits)
152
        except:
153
            return DLIESResponse()
154

    
155
# connector = DLIESConnector('node0-d-dli.d4science.org', 'dli')
156
# res = connector.item_by_id('70|r3d100010134::819365f759cbc38c0362fe8bc684ee7b','dataset',100)
157
# for hit in res.hits[1]['related_dataset']:
158
#     print hit['dnetIdTarget']
159

    
160

    
161
# res = connector.item_by_id('70|r3d100010134::819365f759cbc38c0362fe8bc684ee7b','dataset',1000)
162
# for hit in res.hits[1]['related_dataset']:
163
#     print hit['dnetIdTarget']
164
# print json.dumps(res, cls=DLIESResponseEncoder, indent=2)
(2-2/14)