Project

General

Profile

1
from json import JSONEncoder
2
from elasticsearch import Elasticsearch
3
from elasticsearch_dsl import *
4

    
5

    
6
def create_typology_filter(value):
7
    return Q('match', typology=value)
8

    
9

    
10
def create_pid_type_filter(value):
11
    args = {'localIdentifier.type': value}
12
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))
13

    
14

    
15
def create_datasource_filter(value):
16
    args = {'datasources.datasourceName': value}
17
    return Q('nested', path='datasources', query=Q('bool', must=[Q('match', **args)]))
18

    
19

    
20
class DLIESResponseEncoder(JSONEncoder):
21
    def default(self, o):
22
        return o.__dict__
23

    
24

    
25
class DLIESResponse(object):
26
    def __init__(self, facet=None, total=0, hits=[]):
27
        if facet is None:
28
            facet = dict(pid=[], typology=[], datasource=[])
29
        self.facet = facet
30
        self.total = total
31
        self.hits = hits
32

    
33

    
34
class DLIESConnector(object):
35
    def __init__(self, index_host, index_name):
36
        self.index_host = index_host
37
        self.client = Elasticsearch(hosts=[index_host])
38
        self.index_name = index_name
39

    
40
    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
41
        s = Search(using=self.client, index=self.index_name).doc_type('object').sort('-relatedDatasets',
42
                                                                                     '-relatedPublications',
43
                                                                                     '-relatedUnknown')
44
        q = Q('match', _all=textual_query)
45
        s.aggs.bucket('typologies', 'terms', field='typology')
46
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
47
                                                                              field='datasources.datasourceName')
48

    
49
        filter_queries = []
50
        if user_filter is not None and len(user_filter) > 0:
51
            for f in user_filter.split('__'):
52
                filter_key = f.split('_')[0]
53
                filter_value = f.split('_')[1]
54
                if filter_key == 'typology':
55
                    filter_queries.append(create_typology_filter(filter_value))
56
                elif filter_key == 'datasource':
57
                    filter_queries.append(create_datasource_filter(filter_value))
58
                elif filter_key == 'pidtype':
59
                    filter_queries.append(create_pid_type_filter(filter_value))
60

    
61
        if len(filter_queries) > 0:
62
            s = s.query(q).filter(Q('bool', must=filter_queries))
63
        else:
64
            s = s.query(q)
65

    
66
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
67
                                                                           field='localIdentifier.type')
68

    
69
        if start is not None:
70
            if end is None:
71
                end = start + 10
72
            s = s[start:end]
73
        response = s.execute()
74

    
75
        hits = []
76

    
77
        for index_result in response.hits:
78
            hits.append(index_result.__dict__['_d_'])
79

    
80
        pid_types = []
81
        for tag in response.aggs.all_pids.all_types.buckets:
82
            pid_types.append(dict(key=tag.key, count=tag.doc_count))
83

    
84
        datasources = []
85
        for tag in response.aggs.all_datasources.all_names.buckets:
86
            datasources.append(dict(key=tag.key, count=tag.doc_count))
87

    
88
        typologies = []
89
        for tag in response.aggs.typologies.buckets:
90
            typologies.append(dict(key=tag.key, count=tag.doc_count))
91

    
92
        return DLIESResponse(total=response.hits.total,
93
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources), hits=hits)
94

    
95
    def related_type(self, object_id, object_type, start=None):
96
        args = {'target.objectType': object_type}
97
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
98
        args_id = {'source.dnetIdentifier': object_id}
99
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
100
        s = Search(using=self.client).doc_type('scholix').query(query_for_id & query_type)
101
        if start:
102
            s = s[start:start + 10]
103

    
104
        response = s.execute()
105
        hits = []
106

    
107
        for index_hit in response.hits:
108
            hits.append(index_hit.__dict__['_d_'])
109

    
110
        return hits
111

    
112
    def fix_collectedFrom(self, source, relation):
113
        relSource = relation.get('source')
114
        collectedFrom = relSource['collectedFrom']
115
        for coll in collectedFrom:
116
            for d in source['datasources']:
117
                if d['datasourceName'] == coll['provider']['name']:
118
                    d['provisionMode'] = coll['provisionMode']
119
        return source
120

    
121
    def item_by_id(self, id, type=None, start=None):
122
        try:
123
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
124
            hits = []
125
            input_source = res['_source']
126
            related_publications = []
127
            related_dataset = []
128
            related_unknown = []
129

    
130
            rel_source = None
131
            if input_source.get('relatedPublications') > 0:
132
                if 'publication' == type:
133
                    related_publications = self.related_type(id, 'publication', start)
134
                else:
135
                    related_publications = self.related_type(id, 'publication')
136
                rel_source = related_publications[0]
137
            if input_source.get('relatedDatasets') > 0:
138
                if 'dataset' == type:
139
                    related_dataset = self.related_type(id, 'dataset', start)
140
                else:
141
                    related_dataset = self.related_type(id, 'dataset')
142
                rel_source = related_dataset[0]
143
            if input_source.get('relatedUnknown') > 0:
144
                if 'unknown' == type:
145
                    related_unknown = self.related_type(id, 'unknown', start)
146
                else:
147
                    related_unknown = self.related_type(id, 'unknown')
148
                rel_source = related_unknown[0]
149

    
150
            input_source = self.fix_collectedFrom(input_source, rel_source)
151
            hits.append(input_source)
152

    
153
            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
154
                             related_unknown=related_unknown))
155

    
156
            return DLIESResponse(total=1, hits=hits)
157
        except:
158
            return DLIESResponse()
(2-2/2)