Project

General

Profile

1
from json import JSONEncoder
2

    
3
import sys
4
from elasticsearch import Elasticsearch
5
from elasticsearch_dsl import *
6

    
7
import os
8
from os import path
9

    
10

    
11
def get_property():
12

    
13

    
14
    f = open(path.join(os.path.dirname(os.path.realpath(__file__)), '../../api.properties'))
15
    p = {}
16
    for line in f:
17
        data = line.strip().split("=")
18
        p[data[0].strip()] = data[1].strip()
19
    return p
20

    
21

    
22
def create_typology_filter(value):
23
    return Q('match', typology=value)
24

    
25

    
26
def create_pid_type_filter(value):
27
    args = {'localIdentifier.type': value}
28
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))
29

    
30

    
31
def create_publisher_filter(value):
32
    return Q('match', publisher=value)
33

    
34

    
35
def create_datasource_filter(value):
36
    args = {'datasources.datasourceName': value}
37
    return Q('nested', path='datasources', query=Q('bool', must=[Q('match', **args)]))
38

    
39

    
40
class DLIESResponseEncoder(JSONEncoder):
41
    def default(self, o):
42
        return o.__dict__
43

    
44

    
45
class DLIESResponse(object):
46
    def __init__(self, facet=None, total=0, hits=[]):
47
        if facet is None:
48
            facet = dict(pid=[], typology=[], datasource=[])
49
        self.facet = facet
50
        self.total = total
51
        self.hits = hits
52

    
53

    
54
class DLIESConnector(object):
55
    def __init__(self):
56
        props = get_property()
57
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
58
        self.client = Elasticsearch(hosts=self.index_host)
59
        self.index_name = props['api.index']
60

    
61
    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
62
        s = Search(using=self.client, index=self.index_name).doc_type('object')
63
        q = Q('match', _all=textual_query)
64
        s.aggs.bucket('typologies', 'terms', field='typology')
65
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
66
                                                                              field='datasources.datasourceName')
67
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
68

    
69
        filter_queries = []
70
        if user_filter is not None and len(user_filter) > 0:
71
            for f in user_filter.split('__'):
72
                filter_key = f.split('_')[0]
73
                filter_value = f.split('_')[1]
74
                if filter_key == 'typology':
75
                    filter_queries.append(create_typology_filter(filter_value))
76
                elif filter_key == 'datasource':
77
                    filter_queries.append(create_datasource_filter(filter_value))
78
                elif filter_key == 'pidtype':
79
                    filter_queries.append(create_pid_type_filter(filter_value))
80
                elif filter_key == 'publisher':
81
                    filter_queries.append(create_publisher_filter(filter_value))
82

    
83
        if len(filter_queries) > 0:
84
            s = s.query(q).filter(Q('bool', must=filter_queries))
85
        else:
86
            s = s.query(q)
87

    
88
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
89
                                                                           field='localIdentifier.type')
90

    
91
        if start is not None:
92
            if end is None:
93
                end = start + 10
94
            s = s[start:end]
95
        response = s.execute()
96

    
97
        hits = []
98

    
99
        for index_result in response.hits:
100
            hits.append(index_result.__dict__['_d_'])
101

    
102
        pid_types = []
103
        for tag in response.aggs.all_pids.all_types.buckets:
104
            pid_types.append(dict(key=tag.key, count=tag.doc_count))
105

    
106
        datasources = []
107
        for tag in response.aggs.all_datasources.all_names.buckets:
108
            datasources.append(dict(key=tag.key, count=tag.doc_count))
109

    
110
        typologies = []
111
        for tag in response.aggs.typologies.buckets:
112
            typologies.append(dict(key=tag.key, count=tag.doc_count))
113

    
114
        publishers = []
115
        for tag in response.aggs.all_publisher.buckets:
116
            if len(tag.key) > 0:
117
                publishers.append(dict(key=tag.key, count=tag.doc_count))
118

    
119
        return DLIESResponse(total=response.hits.total,
120
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
121
                                        publishers=publishers), hits=hits)
122

    
123
    def related_type(self, object_id, object_type, start=None):
124
        args = {'target.objectType': object_type}
125
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
126
        args_id = {'source.dnetIdentifier': object_id}
127
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
128
        s = Search(using=self.client).index(self.index_name).doc_type('scholix').query(query_for_id & query_type)
129
        if start:
130
            s = s[start:start + 10]
131

    
132
        response = s.execute()
133
        hits = []
134

    
135
        for index_hit in response.hits:
136
            hits.append(index_hit.__dict__['_d_'])
137

    
138
        return hits
139

    
140
    def fix_collectedFrom(self, source, relation):
141
        relSource = relation.get('source')
142
        collectedFrom = relSource['collectedFrom']
143
        for coll in collectedFrom:
144
            for d in source['datasources']:
145
                if d['datasourceName'] == coll['provider']['name']:
146
                    d['provisionMode'] = coll['provisionMode']
147
        return source
148

    
149
    def item_by_id(self, id, type=None, start=None):
150
        try:
151
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
152
            hits = []
153
            input_source = res['_source']
154
            related_publications = []
155
            related_dataset = []
156
            related_unknown = []
157

    
158
            rel_source = None
159
            if input_source.get('relatedPublications') > 0:
160
                if 'publication' == type:
161
                    related_publications = self.related_type(id, 'publication', start)
162
                else:
163
                    related_publications = self.related_type(id, 'publication')
164
                rel_source = related_publications[0]
165
            if input_source.get('relatedDatasets') > 0:
166
                if 'dataset' == type:
167
                    related_dataset = self.related_type(id, 'dataset', start)
168
                else:
169
                    related_dataset = self.related_type(id, 'dataset')
170
                rel_source = related_dataset[0]
171
            if input_source.get('relatedUnknown') > 0:
172
                if 'unknown' == type:
173
                    related_unknown = self.related_type(id, 'unknown', start)
174
                else:
175
                    related_unknown = self.related_type(id, 'unknown')
176
                rel_source = related_unknown[0]
177

    
178
            input_source = self.fix_collectedFrom(input_source, rel_source)
179
            hits.append(input_source)
180

    
181
            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
182
                             related_unknown=related_unknown))
183

    
184
            return DLIESResponse(total=1, hits=hits)
185
        except Exception as e:
186
            print "Error on getting item "
187
            print e
188
            print "on line %i"% sys.exc_traceback.tb_lineno
189
            return DLIESResponse()
(2-2/2)