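# Helpers and an Elasticsearch connector for the DLI/Scholix index: this module
# resolves persistent identifiers to URLs, builds elasticsearch_dsl query
# fragments, and exposes DLIESConnector, whose methods search the 'object' and
# 'scholix' document types and wrap results in DLIESResponse objects.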
from json import JSONEncoder

import os
import re
import sys
from os import path

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search
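
# printf-style URL templates, keyed by lower-cased identifier type; used by
# resolveIdentifier() below to turn an identifier into a resolvable link.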
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "genbank": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "nuccore": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "swiss-prot": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "arrayexpress": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "biomodels": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bmrb": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ena": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "geo": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ensembl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "mgi": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bind": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "pride": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ddbj": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bioproject": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "embl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "sra": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
}
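
# Resolve a (pid, pid_type) pair to a URL: DOI-shaped identifiers use the DOI
# resolver, known types use pid_resolver, 'openaire' and 'url' get special
# handling, and anything else falls back to identifiers.org.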
def resolveIdentifier(pid, pid_type):
    if pid_type is not None:
        regex = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b"
        if re.match(regex, pid):
            print("It should be doi")
            pid_type = 'doi'
        if pid_type.lower() in pid_resolver:
            return pid_resolver[pid_type.lower()] % pid
        else:
            if pid_type.lower() == 'openaire':
                return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
            elif pid_type.lower() == 'url':
                return pid
            else:
                return "http://identifiers.org/%s:%s" % (pid_type, pid)
    return ""
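
# Load the key=value pairs from the api.properties file located two directories
# above this module.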
def get_property():
    p = {}
    with open(path.join(os.path.dirname(os.path.realpath(__file__)), '../../api.properties')) as f:
        for line in f:
            data = line.strip().split("=", 1)
            p[data[0].strip()] = data[1].strip()
    return p
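
# Factories for the elasticsearch_dsl Q fragments used below for facet filters
# and identifier lookups.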
def create_typology_filter(value):
    return Q('match', typology=value)


def create_pid_type_filter(value):
    args = {'localIdentifier.type': value}
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))


def create_pid_query(value):
    args = {'localIdentifier.id': value}
    return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)]))


def create_publisher_filter(value):
    return Q('match', publisher=value)


def create_datasource_filter(value):
    args = {'datasources.datasourceName': value}
    return Q('nested', path='datasources', query=Q('bool', must=[Q('match', **args)]))
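
# JSON encoder that serialises DLIESResponse objects through their __dict__.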
class DLIESResponseEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__


class DLIESResponse(object):
    # Container for search results: facet counts, total number of hits and the
    # list of hit documents.
    def __init__(self, facet=None, total=0, hits=None):
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
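
# Thin wrapper around the Elasticsearch cluster configured in api.properties
# ('es_index' for the hosts, 'api.index' for the index name).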
class DLIESConnector(object):
    def __init__(self):
        props = get_property()
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host)
        self.index_name = props['api.index']

    def get_main_page_stats(self):
        # Overall number of scholix documents (halved, presumably because each
        # relation is indexed in both directions) plus per-typology object counts.
        stats = dict(total=Search(using=self.client, index=self.index_name).doc_type('scholix').execute().hits.total // 2)
        for item in ['dataset', 'publication']:
            s = Search(using=self.client, index=self.index_name).doc_type('object').query(Q('match', typology=item))
            stats[item] = s.execute().hits.total
        return stats

    def query_by_id(self, id):
        # Look up objects by one of their persistent identifiers and collect
        # facet aggregations (typology, datasource, publisher, pid type).
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        s = s.query(create_pid_query(id))
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')
        response = s.execute()

        hits = []

        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            # Strip surrounding double quotes from titles.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        # Full-text search with optional paging and facet filters; user_filter is a
        # string of 'key_value' pairs separated by '__' (e.g. 'typology_dataset__pidtype_doi').
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        q = Q('match', _all=textual_query)
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')

        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            for f in user_filter.split('__'):
                filter_key = f.split('_')[0]
                filter_value = f.split('_')[1]
                if filter_key == 'typology':
                    filter_queries.append(create_typology_filter(filter_value))
                elif filter_key == 'datasource':
                    filter_queries.append(create_datasource_filter(filter_value))
                elif filter_key == 'pidtype':
                    filter_queries.append(create_pid_type_filter(filter_value))
                elif filter_key == 'publisher':
                    filter_queries.append(create_publisher_filter(filter_value))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')

        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = []

        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def related_type(self, object_id, object_type, start=None):
        # Fetch the scholix relations whose source is object_id and whose target has
        # the given object type, resolving the target identifiers to URLs.
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name).doc_type('scholix').query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []

        for index_hit in response.hits:
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                ids = []
                for item in current_item['target']['identifier']:
                    c_it = item
                    c_it['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    ids.append(c_it)
                current_item['target']['identifier'] = ids
            hits.append(current_item)

        return hits

    def fix_collectedFrom(self, source, relation):
        # Copy the provisionMode of each collectedFrom entry of the relation onto
        # the matching datasource of the source record.
        if not relation:
            return source
        relSource = relation.get('source')
        if not relSource:
            return source
        for coll in relSource.get('collectedFrom', []):
            for d in source.get('datasources', []):
                if d['datasourceName'] == coll['provider']['name']:
                    d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        # Fetch a single object by its identifier together with its related
        # publications, datasets and unknown objects.
        try:
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
            hits = []
            input_source = res['_source']
            fixed_titles = []
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles

            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            related_publications = []
            related_dataset = []
            related_unknown = []

            rel_source = None
            if input_source.get('relatedPublications', 0) > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                rel_source = related_publications[0] if related_publications else {}
            if input_source.get('relatedDatasets', 0) > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                rel_source = related_dataset[0] if related_dataset else {}
            if input_source.get('relatedUnknown', 0) > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                rel_source = related_unknown[0] if related_unknown else {}

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits.append(input_source)

            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
                             related_unknown=related_unknown))

            return DLIESResponse(total=1, hits=hits)
        except Exception as e:
            print("Error on getting item")
            print(e)
            print("on line %i" % sys.exc_info()[2].tb_lineno)
            return DLIESResponse()
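
# Illustrative usage (a sketch, assuming an api.properties file reachable at
# ../../api.properties with valid 'es_index' and 'api.index' entries and a
# running Elasticsearch cluster; the query string is arbitrary):
#
#   connector = DLIESConnector()
#   print(connector.get_main_page_stats())
#   result = connector.simple_query('climate', start=0, end=10)
#   print(result.total, len(result.hits))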