# Scholexplorer portal: Elasticsearch connector module.
from json import JSONEncoder
import sys
import re
from elasticsearch import Elasticsearch
from elasticsearch_dsl import *
import logging
from eu.dnetlib.util import get_index_properties
import traceback
import os
from os import path

log = logging.getLogger("scholexplorer-portal")

# Maps a lower-cased pid type to a URL template used to build a landing page.
# NOTE(review): every type from "genbank" down to "sra" points at the same NCBI
# nucest template — presumably placeholders; confirm before relying on them.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "genbank": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "nuccore": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "swiss-prot": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "arrayexpress": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "biomodels": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bmrb": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ena": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "geo": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ensembl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "mgi": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bind": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "pride": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ddbj": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bioproject": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "embl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "sra": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "url": "%s"
}


def resolveIdentifier(pid, pid_type):
    """Return a resolvable landing-page URL for *pid* of type *pid_type*.

    A pid that syntactically looks like a DOI is resolved as a DOI regardless
    of the declared type. Unknown types fall back to identifiers.org, and an
    empty string is returned when *pid_type* is None.
    """
    if pid_type is not None:
        # DOI detection: "10.NNNN/suffix" anywhere at the start of the pid.
        regex = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b"
        if re.match(regex, pid):
            log.debug("It should be doi")
            pid_type = 'doi'
        if pid_type.lower() in pid_resolver:
            return pid_resolver[pid_type.lower()] % pid
        if pid_type.lower() == 'openaire':
            return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
        # NOTE: the old special case for 'url' was unreachable because 'url'
        # is a pid_resolver key (template "%s" already returns pid verbatim).
        return "http://identifiers.org/%s:%s" % (pid_type, pid)
    return ""
def create_typology_filter(value):
    """Build a match query on the top-level ``typology`` field."""
    kwargs = dict(typology=value)
    return Q('match', **kwargs)
def create_pid_type_filter(value):
    """Build a nested query matching ``localIdentifier.type`` against *value*."""
    type_match = Q('match', **{'localIdentifier.type': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[type_match]))
def create_pid_query(value):
    """Build a nested query matching ``localIdentifier.id`` against *value*."""
    id_match = Q('match', **{'localIdentifier.id': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[id_match]))
def create_publisher_filter(value):
    """Build a match query on the top-level ``publisher`` field."""
    return Q('match', **dict(publisher=value))
def create_datasource_filter(value):
    """Build a nested query matching ``datasources.datasourceName`` against *value*."""
    name_match = Q('match', **{'datasources.datasourceName': value})
    return Q('nested', path='datasources', query=Q('bool', must=[name_match]))
class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serializes arbitrary objects via their attribute dict."""

    def default(self, o):
        # Expose the instance attributes directly as a JSON object.
        return vars(o)
class DLIESResponse(object):
    """Container returned by the connector's query methods.

    Attributes:
        facet -- dict of facet bucket lists (pid, typology, datasource, ...)
        total -- total number of matching documents
        hits  -- list of result records
    """

    def __init__(self, facet=None, total=0, hits=None):
        # Bug fix: `hits=[]` was a shared mutable default argument; every
        # response built without an explicit hits list shared (and accumulated
        # into) the same list object across calls.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        self.facet = facet
        self.total = total
        self.hits = [] if hits is None else hits
class DLIESConnector(object):
    """Facade over the scholexplorer Elasticsearch indexes."""

    def __init__(self):
        # Connection settings come from the shared index properties file.
        props = get_index_properties()
        hosts = props['es_index'].split(',')
        self.index_host = [host.strip() for host in hosts]
        # NOTE(review): timeout=600000 — if interpreted as seconds this is
        # ~7 days; looks like a milliseconds value. Confirm the intended unit.
        self.client = Elasticsearch(hosts=self.index_host, timeout=600000)
        self.index_name = props['api.index']
    def get_main_page_stats(self):
        """Return the landing-page counters: total scholix links (halved) and
        per-typology object counts for datasets and publications."""
        scholix_total = Search(using=self.client, index=self.index_name + "_scholix").count()
        # NOTE(review): the halving presumably collapses the two stored
        # directions of each relation into one link — confirm.
        stats = dict(total=int(scholix_total / 2))
        for typology in ('dataset', 'publication'):
            query = Search(using=self.client, index=self.index_name + "_object").query(Q('match', typology=typology))
            stats[typology] = query.count()
        return stats
    def query_by_id(self, id):
        """Look up objects whose ``localIdentifier.id`` matches *id*.

        Runs a nested pid query against the ``*_object`` index, enriches each
        hit (resolved pid URLs, de-quoted titles) and returns a DLIESResponse
        with facet buckets for pid types, typologies, datasources and
        publishers.
        NOTE(review): the parameter `id` shadows the builtin of the same name.
        """
        s = Search(using=self.client, index=self.index_name+"_object")
        s = s.query(create_pid_query(id))
        # Facet aggregations; nested buckets are needed for nested mappings.
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # Raw backing dict of the DSL hit object.
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            # Attach a resolved landing-page URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            # Strip one pair of surrounding double quotes from titles, if any.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        # Flatten aggregation buckets into simple {key, count} dicts.
        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            # Skip the empty-string publisher bucket.
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)
    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Full-text search over title/abstract in the ``*_object`` index.

        :param textual_query: query string; the literal '*' means match-all.
        :param start, end: optional result window; when only *start* is given
                           a page size of 10 is assumed.
        :param user_filter: '__'-separated list of 'key_value' filters with
                            keys typology/datasource/pidtype/publisher.
                            NOTE(review): values containing '_' would be
                            truncated by the split — confirm upstream format.
        :return: DLIESResponse with facets and enriched hits.
        """
        s = Search(using=self.client, index=self.index_name+"_object")
        if not textual_query == '*':
            q = Q("multi_match", query=textual_query, fields=['title', 'abstract'])
        else:
            # Match-all query.
            q = Q()
        # Facet aggregations (added before .query(); aggs survive the clone).
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')

        # Translate the encoded user filter into DSL filter clauses.
        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            for f in user_filter.split('__'):
                filter_key = f.split('_')[0]
                filter_value = f.split('_')[1]
                if filter_key == 'typology':
                    filter_queries.append(create_typology_filter(filter_value))
                elif filter_key == 'datasource':
                    filter_queries.append(create_datasource_filter(filter_value))
                elif filter_key == 'pidtype':
                    filter_queries.append(create_pid_type_filter(filter_value))
                elif filter_key == 'publisher':
                    filter_queries.append(create_publisher_filter(filter_value))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')

        # Apply the pagination window (default page size: 10).
        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # Raw backing dict of the DSL hit object.
            input_source = index_result.__dict__['_d_']
            fixed_titles = []
            # Attach a resolved landing-page URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds

            # Titles may be missing/None in the index; substitute a placeholder.
            if input_source.get('title', []) is not None:
                for t in input_source.get('title', []):
                    # Strip one pair of surrounding double quotes, if present.
                    if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                        fixed_titles.append(t[1:-1])
                    else:
                        fixed_titles.append(t)
            else:
                fixed_titles.append("title not available")
            input_source['title'] = fixed_titles
            hits.append(input_source)

        # Flatten aggregation buckets into simple {key, count} dicts.
        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            # Skip the empty-string publisher bucket.
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        # NOTE(review): s.count() issues an extra round trip; response.hits.total
        # may already hold the same number — confirm before changing.
        return DLIESResponse(total=s.count(),
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)
    def related_type(self, object_id, object_type, start=None):
        """Return scholix relations whose source is *object_id* and whose
        target has typology *object_type*.

        :param start: optional offset; a page of 10 relations is returned.
                      NOTE(review): `if start:` treats 0 as "no pagination" —
                      confirm callers never page from offset 0 this way.
        :return: list of raw relation dicts with resolved target pid URLs.
        """
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name+"_scholix").query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []
        for index_hit in response.hits:
            # Raw backing dict of the DSL hit object.
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                # Attach a resolved landing-page URL to each target identifier.
                ids = []
                for item in current_item['target']['identifier']:
                    c_it = item
                    c_it['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    ids.append(c_it)
                current_item['target']['identifier'] = ids
            hits.append(current_item)

        return hits
    def fix_collectedFrom(self, source, relation):
271
        if relation is None:
272
            return
273
        relSource = relation.get('source')
274
        collectedFrom = relSource.get('collectedFrom',[])
275
        if collectedFrom is not None:
276
            for coll in collectedFrom:
277
                for d in source['datasources']:
278
                    if d['datasourceName'] == coll['provider']['name']:
279
                        d['provisionMode'] = coll['provisionMode']
280
        return source
281

    
282
    def item_by_id(self, id, type=None, start=None):
        """Fetch one object by document id and collect its related records.

        :param id: Elasticsearch document id (shadows the builtin `id`).
        :param type: typology being browsed ('publication', 'dataset',
                     'unknown'); relations of that type are paginated with
                     *start*, the others fetch the first page only.
        :param start: pagination offset forwarded to related_type().
        :return: DLIESResponse whose hits are [record, relations-dict], or an
                 empty DLIESResponse when anything fails (errors are logged).
        """
        try:
            res = self.client.get(index=self.index_name+"_object", doc_type="_all", id=id, _source=True)
            hits = []
            input_source = res['_source']
            # Strip one pair of surrounding double quotes from titles, if any.
            fixed_titles = []
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles

            # Attach a resolved landing-page URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            related_publications = []
            related_dataset = []
            related_unknown = []

            rel_source = None
            # Bug fix: the counters used .get(key) with no default, so a
            # missing key produced `None > 0` (TypeError on Python 3).
            if input_source.get('relatedPublications', 0) > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                if len(related_publications) > 0:
                    rel_source = related_publications[0]
                else:
                    rel_source = {}

            if input_source.get('relatedDatasets', 0) > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                # Guard empty results (consistent with the publications branch;
                # the original indexed [0] unconditionally).
                if len(related_dataset) > 0:
                    rel_source = related_dataset[0]
            if input_source.get('relatedUnknown', 0) > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                if len(related_unknown) > 0:
                    rel_source = related_unknown[0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits.append(input_source)

            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
                             related_unknown=related_unknown))

            return DLIESResponse(total=1, hits=hits)
        except Exception as e:
            log.error("Error on getting item ")
            log.error(e)
            # Bug fix: the old code did `"on line %i" % sys.exc_info`, which
            # formatted the function object itself and raised inside the
            # handler. Log the full traceback instead (traceback is imported
            # at module level for exactly this).
            log.error(traceback.format_exc())
            return DLIESResponse()