from json import JSONEncoder
import os
import sys
from os import path

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search
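

# Maps a persistent-identifier type (lower case) to the URL template used to
# turn an identifier of that type into a resolvable link.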
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "genbank": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "nuccore": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "swiss-prot": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "arrayexpress": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "biomodels": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bmrb": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ena": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "geo": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ensembl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "mgi": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bind": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "pride": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ddbj": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bioproject": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "embl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "sra": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
}


def resolveIdentifier(pid, pid_type):
    # Build a resolvable URL for the given identifier; fall back to
    # identifiers.org when the type has no dedicated resolver.
    if pid_type is not None:
        if pid_type.lower() in pid_resolver:
            return pid_resolver[pid_type.lower()] % pid
        elif pid_type.lower() == 'openaire':
            return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
        else:
            return "http://identifiers.org/%s:%s" % (pid_type, pid)
    return ""
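

# Loads the API configuration from api.properties, two directories above this
# module. The keys read below are 'es_index' (a comma-separated list of
# Elasticsearch hosts) and 'api.index' (the index to query); for example
# (values illustrative only):
#
#   es_index=localhost:9200
#   api.index=<index name>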
def get_property():
    p = {}
    with open(path.join(os.path.dirname(os.path.realpath(__file__)), '../../api.properties')) as f:
        for line in f:
            # split on the first '=' so values may themselves contain '='
            data = line.strip().split("=", 1)
            p[data[0].strip()] = data[1].strip()
    return p
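

# Helpers that build elasticsearch_dsl Q objects for the facet filters used by
# simple_query; the pid-type and datasource filters query nested documents.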
def create_typology_filter(value):
    return Q('match', typology=value)


def create_pid_type_filter(value):
    args = {'localIdentifier.type': value}
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))


def create_publisher_filter(value):
    return Q('match', publisher=value)


def create_datasource_filter(value):
    args = {'datasources.datasourceName': value}
    return Q('nested', path='datasources', query=Q('bool', must=[Q('match', **args)]))
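

# JSON encoder that serialises the response objects below via their __dict__.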
class DLIESResponseEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__
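

# Simple container for a search result: total hit count, the hits themselves
# and the facet buckets (pid types, typologies, datasources).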
class DLIESResponse(object):
    def __init__(self, facet=None, total=0, hits=None):
        # Avoid shared mutable defaults for facet and hits across instances.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
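

# Thin wrapper around the Elasticsearch index holding DLI objects and scholix
# relations; configuration comes from api.properties via get_property().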
class DLIESConnector(object):
    def __init__(self):
        props = get_property()
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host)
        self.index_name = props['api.index']
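
    # Full-text search over 'object' documents. user_filter is a string of
    # '__'-separated '<key>_<value>' pairs, where key is one of typology,
    # datasource, pidtype or publisher; start/end select the slice of hits.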
    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        q = Q('match', _all=textual_query)
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')

        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            for f in user_filter.split('__'):
                # Split only on the first underscore so that filter values
                # containing underscores are preserved.
                filter_key, filter_value = f.split('_', 1)
                if filter_key == 'typology':
                    filter_queries.append(create_typology_filter(filter_value))
                elif filter_key == 'datasource':
                    filter_queries.append(create_datasource_filter(filter_value))
                elif filter_key == 'pidtype':
                    filter_queries.append(create_pid_type_filter(filter_value))
                elif filter_key == 'publisher':
                    filter_queries.append(create_publisher_filter(filter_value))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')

        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = []

        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            for ids in input_source.get('localIdentifier', []):
                ids['url'] = resolveIdentifier(ids['id'], ids['type'])
            # Strip the surrounding double quotes that some records carry in titles.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)
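
    # Returns the payloads of scholix relations whose source.dnetIdentifier
    # matches object_id and whose target.objectType matches object_type,
    # optionally paged ten at a time from 'start'.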
    def related_type(self, object_id, object_type, start=None):
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name).doc_type('scholix').query(query_for_id & query_type)
        if start is not None:
            s = s[start:start + 10]

        response = s.execute()
        hits = []

        for index_hit in response.hits:
            hits.append(index_hit.__dict__['_d_'])

        return hits
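
    # Copies the provisionMode of each collecting datasource found in the
    # relation onto the matching datasource of the source record.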
    def fix_collectedFrom(self, source, relation):
        if relation is None:
            return source
        relSource = relation.get('source')
        collectedFrom = relSource.get('collectedFrom', []) if relSource else []
        for coll in collectedFrom:
            for d in source.get('datasources', []):
                if d['datasourceName'] == coll['provider']['name']:
                    d['provisionMode'] = coll['provisionMode']
        return source
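
    # Fetches a single 'object' document by identifier, resolves its local
    # identifiers, gathers related publications/datasets/unknown records and
    # wraps everything in a DLIESResponse.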
    def item_by_id(self, id, type=None, start=None):
        try:
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
            hits = []
            input_source = res['_source']
            fixed_titles = []
            # Strip the surrounding double quotes that some records carry in titles.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles

            for ids in input_source.get('localIdentifier', []):
                ids['url'] = resolveIdentifier(ids['id'], ids['type'])
            related_publications = []
            related_dataset = []
            related_unknown = []

            rel_source = None
            if input_source.get('relatedPublications', 0) > 0:
                if type == 'publication':
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                if len(related_publications) > 0:
                    rel_source = related_publications[0]
                else:
                    rel_source = {}
            if input_source.get('relatedDatasets', 0) > 0:
                if type == 'dataset':
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                if len(related_dataset) > 0:
                    rel_source = related_dataset[0]
            if input_source.get('relatedUnknown', 0) > 0:
                if type == 'unknown':
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                if len(related_unknown) > 0:
                    rel_source = related_unknown[0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits.append(input_source)

            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
                             related_unknown=related_unknown))

            return DLIESResponse(total=1, hits=hits)
        except Exception as e:
            print("Error on getting item ")
            print(e)
            print("on line %i" % sys.exc_info()[2].tb_lineno)
            return DLIESResponse()