Project

General

Profile

1
from json import JSONEncoder
2
import sys
3
import re
4
from elasticsearch import Elasticsearch
5
from elasticsearch_dsl import *
6
import logging
7
from eu.dnetlib.util import get_index_properties
8

    
9
import os
10
from os import path
11

    
12

    
13
log = logging.getLogger("scholexplorer-portal")

# Compiled once at import time instead of being recompiled on every call.
# Pattern recognises a DOI (prefix "10.", 4+ digit registrant, suffix).
_DOI_REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b")

# Maps a lowercase pid scheme to a printf-style landing-page URL template;
# the single "%s" slot receives the raw identifier value.
# NOTE(review): every entry from "genbank" downwards points at the same NCBI
# nucest template -- this looks like a copy/paste placeholder; confirm the
# intended resolver URL for each of those schemes.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "genbank": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "nuccore": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "swiss-prot": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "arrayexpress": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "biomodels": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bmrb": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ena": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "geo": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ensembl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "mgi": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bind": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "pride": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ddbj": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bioproject": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "embl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "sra": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
}


def resolveIdentifier(pid, pid_type):
    """Resolve a persistent identifier to a landing-page URL.

    :param pid: raw identifier value (e.g. ``"10.1000/xyz"``).
    :param pid_type: identifier scheme, matched case-insensitively;
        ``None`` yields the empty string.
    :return: a resolvable URL string, or ``""`` when ``pid_type`` is None.
    """
    if pid_type is None:
        return ""
    # Anything shaped like a DOI is resolved as a DOI, regardless of the
    # scheme the caller declared.
    if _DOI_REGEX.match(pid):
        log.debug("It should be doi")
        pid_type = 'doi'
    lowered = pid_type.lower()
    if lowered in pid_resolver:
        return pid_resolver[lowered] % pid
    if lowered == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    if lowered == 'url':
        return pid
    # Unknown scheme: fall back to the identifiers.org meta-resolver,
    # keeping the scheme in its original case as before.
    return "http://identifiers.org/%s:%s" % (pid_type, pid)
57

    
58

    
59

    
60

    
61

    
62
def create_typology_filter(value):
    """Return a match query restricting results to the given typology value."""
    criteria = {'typology': value}
    return Q('match', **criteria)
64

    
65

    
66
def create_pid_type_filter(value):
    """Return a nested query matching documents whose localIdentifier.type equals *value*."""
    type_match = Q('match', **{'localIdentifier.type': value})
    inner = Q('bool', must=[type_match])
    return Q('nested', path='localIdentifier', query=inner)
69

    
70

    
71
def create_pid_query(value):
    """Return a nested query matching documents whose localIdentifier.id equals *value*."""
    id_match = Q('match', **{'localIdentifier.id': value})
    inner = Q('bool', must=[id_match])
    return Q('nested', path='localIdentifier', query=inner)
74

    
75

    
76
def create_publisher_filter(value):
    """Return a match query on the publisher field."""
    criteria = {'publisher': value}
    return Q('match', **criteria)
78

    
79

    
80
def create_datasource_filter(value):
    """Return a nested query matching documents collected from the named datasource."""
    name_match = Q('match', **{'datasources.datasourceName': value})
    inner = Q('bool', must=[name_match])
    return Q('nested', path='datasources', query=inner)
83

    
84

    
85
class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serialises arbitrary objects through their instance ``__dict__``."""

    def default(self, o):
        # Expose the instance attributes as a plain dict so json can
        # serialise DLIESResponse (and similar) objects directly.
        attributes = o.__dict__
        return attributes
88

    
89

    
90
class DLIESResponse(object):
    """Container for an Elasticsearch query result.

    :param facet: dict of facet-name -> bucket list; defaults to empty
        ``pid``/``typology``/``datasource`` facets.
    :param total: total number of matching documents.
    :param hits: list of result documents (defaults to an empty list).
    """

    def __init__(self, facet=None, total=0, hits=None):
        # None sentinels for both mutable defaults: the previous
        # ``hits=[]`` default was a single shared list, so mutating one
        # response's hits leaked into every later default-constructed one.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
97

    
98

    
99
class DLIESConnector(object):
    """Facade over the Scholexplorer Elasticsearch index.

    Connection settings come from the DNet index properties:
    ``es_index`` is a comma-separated host list, ``api.index`` is the
    index name. Documents live in two doc types: ``object`` (entities)
    and ``scholix`` (links between entities).
    """

    def __init__(self):
        props = get_index_properties()
        # 'es_index' is a comma-separated host list, e.g. "host1,host2".
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host)
        self.index_name = props['api.index']

    def get_main_page_stats(self):
        """Return landing-page counters: total link count plus per-typology object counts.

        The scholix count is divided by 2 -- presumably because each link
        is indexed in both directions (TODO confirm against the indexer).
        """
        stats = dict(total =int(Search(using=self.client, index=self.index_name).doc_type('scholix').count()/2))
        for item in ['dataset', 'publication']:
            s= Search(using=self.client, index=self.index_name).doc_type('object').query(Q('match', typology=item))
            stats[item] = s.count()
        return stats

    def query_by_id(self, id):
        """Look up objects by persistent identifier and return a faceted DLIESResponse.

        :param id: the localIdentifier.id value to match (exact pid).
        :return: DLIESResponse with pid/typology/datasource/publisher facets.
        """
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        s = s.query(create_pid_query(id))
        # Facet buckets; nested aggs are required because localIdentifier
        # and datasources are nested mappings.
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # '_d_' is the raw source dict behind an elasticsearch_dsl hit.
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            # Attach a resolvable URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            # Strip surrounding double quotes left over from ingestion.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        # Flatten aggregation buckets into [{key, count}, ...] facet lists.
        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            # Skip the empty-string publisher bucket.
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Free-text search over objects with optional pagination and facet filters.

        :param textual_query: query string; the literal "*" means match-all.
        :param start: zero-based offset of the first hit; ``None`` = ES default page.
        :param end: offset past the last hit; defaults to ``start + 10``.
        :param user_filter: '__'-separated list of 'key_value' filters, where
            key is one of typology/datasource/pidtype/publisher.
            NOTE(review): values containing '_' would be truncated by the
            split('_')[1] parsing below -- confirm upstream encoding.
        :return: DLIESResponse with the same facets as query_by_id.
        """
        s = Search(using=self.client, index=self.index_name).doc_type('object')

        if not textual_query  == '*':
            # NOTE(review): '_all' is the legacy catch-all field, removed in
            # newer Elasticsearch versions -- verify the cluster version.
            q = Q('match', _all=textual_query)
        else:
            q = Q()
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')

        # Translate the user_filter string into elasticsearch_dsl queries.
        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            for f in user_filter.split('__'):
                filter_key = f.split('_')[0]
                filter_value = f.split('_')[1]
                if filter_key == 'typology':
                    filter_queries.append(create_typology_filter(filter_value))
                elif filter_key == 'datasource':
                    filter_queries.append(create_datasource_filter(filter_value))
                elif filter_key == 'pidtype':
                    filter_queries.append(create_pid_type_filter(filter_value))
                elif filter_key == 'publisher':
                    filter_queries.append(create_publisher_filter(filter_value))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')

        # Pagination: default page size is 10 when only start is given.
        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # Same hit post-processing as query_by_id: attach pid URLs and
            # strip quoted titles.
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def related_type(self, object_id, object_type, start=None):
        """Return scholix links whose source is *object_id* and whose target has *object_type*.

        :param object_id: dnetIdentifier of the source object.
        :param object_type: target objectType ('publication'/'dataset'/'unknown').
        :param start: optional offset; when set, a fixed page of 10 links is returned.
        :return: list of raw scholix link dicts with target identifier URLs resolved.
        """
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name).doc_type('scholix').query(query_for_id & query_type)
        # NOTE(review): truthiness check means start=0 is treated like
        # "no pagination" -- confirm callers never pass 0 explicitly.
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []

        for index_hit in response.hits:
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                # Resolve each target identifier to a clickable URL in place.
                ids = []
                for item in current_item['target']['identifier']:
                    c_it = item
                    c_it['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    ids .append(c_it)
                current_item['target']['identifier'] = ids
            hits.append(current_item)

        return hits

    def fix_collectedFrom(self, source, relation):
        """Copy provisionMode from a relation's collectedFrom into matching source datasources.

        Matching is by datasource name. NOTE(review): returns None (not
        *source*) when *relation* is None -- the caller in item_by_id then
        appends None to its hits; confirm this is intended.
        """
        if relation is None:
            return
        relSource = relation.get('source')
        collectedFrom = relSource.get('collectedFrom',[])
        for coll in collectedFrom:
            for d in source['datasources']:
                if d['datasourceName'] == coll['provider']['name']:
                    d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        """Fetch a single object by document id plus its related items.

        :param id: Elasticsearch document id of the object.
        :param type: which relation family to paginate ('publication',
            'dataset' or 'unknown'); others are fetched unpaginated.
        :param start: pagination offset forwarded to related_type.
        :return: DLIESResponse whose hits are [object, relations-dict];
            an empty DLIESResponse on any error.
        """
        try:
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
            hits = []
            input_source = res['_source']
            fixed_titles = []
            # Strip surrounding double quotes left over from ingestion.
            for t in input_source.get('title',[]):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles

            # Attach a resolvable URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            related_publications = []
            related_dataset = []
            related_unknown = []

            rel_source = None
            # NOTE(review): on Python 3, .get(...) returns None when the
            # field is absent and `None > 0` raises TypeError, which is then
            # swallowed by the broad except below -- Python 2 semantics
            # appear assumed here; confirm the runtime.
            if input_source.get('relatedPublications') > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                if len(related_publications) > 0:
                    rel_source = related_publications[0]
                else:
                    rel_source = {}
            if input_source.get('relatedDatasets') > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                # NOTE(review): unlike the publications branch, no
                # empty-list guard before indexing [0] -- IndexError here is
                # swallowed by the broad except; confirm intended.
                rel_source = related_dataset[0]
            if input_source.get('relatedUnknown') > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                rel_source = related_unknown[0]

            # Enrich the object's datasources with provisionMode taken from
            # the first related link (may be None -- see fix_collectedFrom).
            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits.append(input_source)

            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
                             related_unknown=related_unknown))

            return DLIESResponse(total=1, hits=hits)
        except Exception as e:
            # Best-effort endpoint: any failure degrades to an empty response.
            log.error("Error on getting item ")
            log.error(e)
            #log.error("on line %i" % sys.exc_info)
            return DLIESResponse()
(2-2/5)